diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch new file mode 100644 index 000000000..77c829a33 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch @@ -0,0 +1,98 @@ +# Calculate format=diff os_linux_system==desktop +From 59fd22f37d9acfa07186a02bb1cd2d64785d82b1 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sat, 4 Feb 2012 10:55:51 +0100 +Subject: [PATCH 1/2] block: cgroups, kconfig, build bits for BFQ-v6-3.8 + +Update Kconfig.iosched and do the related Makefile changes to include +kernel configuration options for BFQ. Also add the bfqio controller +to the cgroups subsystem. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 26 ++++++++++++++++++++++++++ + block/Makefile | 1 + + include/linux/cgroup_subsys.h | 6 ++++++ + 3 files changed, 33 insertions(+) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 421bef9..56474b2 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -39,6 +39,28 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ ++ tristate "BFQ I/O scheduler" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ The BFQ I/O scheduler tries to distribute bandwidth among ++ all processes according to their weights. ++ It aims at distributing the bandwidth as desired, independently of ++ the disk parameters and with any workload. It also tries to ++ guarantee low latency to interactive and soft real-time ++ applications. If compiled built-in (saying Y here), BFQ can ++ be configured to support hierarchical scheduling. ++ ++config CGROUP_BFQIO ++ bool "BFQ hierarchical scheduling support" ++ depends on CGROUPS && IOSCHED_BFQ=y ++ default n ++ ---help--- ++ Enable hierarchical scheduling in BFQ, using the cgroups ++ filesystem interface. The name of the subsystem will be ++ bfqio. 
++ + choice + prompt "Default I/O scheduler" + default DEFAULT_CFQ +@@ -52,6 +74,9 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ ++ bool "BFQ" if IOSCHED_BFQ=y ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -61,6 +86,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq" if DEFAULT_BFQ + default "noop" if DEFAULT_NOOP + + endmenu +diff --git a/block/Makefile b/block/Makefile +index 39b76ba..c0d20fa 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h +index f204a7a..b999bfa 100644 +--- a/include/linux/cgroup_subsys.h ++++ b/include/linux/cgroup_subsys.h +@@ -78,3 +78,9 @@ SUBSYS(hugetlb) + #endif + + /* */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++SUBSYS(bfqio) ++#endif ++ ++/* */ +-- +1.8.1.2 + diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch new file mode 100644 index 000000000..34cd92a6d --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch @@ -0,0 +1,5856 @@ +# Calculate format=diff os_linux_system==desktop +From d2ba3dc4196b2b1579f6ccbb64880e662684b8ba Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sun, 5 Feb 2012 01:04:27 +0100 +Subject: [PATCH 2/2] block: introduce the BFQ-v6 I/O sched for 3.8 + +Add the BFQ-v6 I/O scheduler to 3.8. +The general structure is borrowed from CFQ, as much code. A (bfq_)queue is +associated to each task doing I/O on a device, and each time a scheduling +decision has to be made a queue is selected and served until it expires. + + - Slices are given in the service domain: tasks are assigned budgets, + measured in number of sectors. Once got the disk, a task must + however consume its assigned budget within a configurable maximum time + (by default, the maximum possible value of the budgets is automatically + computed to comply with this timeout). This allows the desired latency + vs "throughput boosting" tradeoff to be set. + + - Budgets are scheduled according to a variant of WF2Q+, implemented + using an augmented rb-tree to take eligibility into account while + preserving an O(log N) overall complexity. + + - A low-latency tunable is provided; if enabled, both interactive and soft + real-time applications are guaranteed very low latency. + + - Latency guarantees are preserved also in presence of NCQ. + + - Also with flash-based devices, a high throughput is achieved while + still preserving latency guarantees. + + - A useful feature borrowed from CFQ: static fallback queue for OOM. + + - Differently from CFQ, BFQ uses a unified mechanism (Early Queue Merge, + EQM) to get a sequential read pattern, and hence a high throughput, + with any set of processes performing interleaved I/O. EQM also + preserves low latency. 
The code for detecting whether two queues have + to be merged is a slightly modified version of the CFQ code for + detecting whether two queues belong to cooperating processes and whether + the service of a queue should be preempted to boost the throughput. + + - BFQ supports full hierarchical scheduling, exporting a cgroups + interface. Each node has a full scheduler, so each group can + be assigned its own ioprio (mapped to a weight, see next point) + and an ioprio_class. + + - If the cgroups interface is used, weights can be explictly assigned, + otherwise ioprio values are mapped to weights using the relation + weight = IOPRIO_BE_NR - ioprio. + + - ioprio classes are served in strict priority order, i.e., lower + priority queues are not served as long as there are higher priority + queues. Among queues in the same class the bandwidth is distributed + in proportion to the weight of each queue. A very thin extra bandwidth + is however guaranteed to the Idle class, to prevent it from starving. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/bfq-cgroup.c | 838 ++++++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-iosched.c | 3218 +++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1044 +++++++++++++++++ + block/bfq.h | 617 ++++++++++ + 5 files changed, 5753 insertions(+) + create mode 100644 block/bfq-cgroup.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-iosched.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq.h + +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +new file mode 100644 +index 0000000..1ae54d1 +--- /dev/null ++++ b/block/bfq-cgroup.c +@@ -0,0 +1,838 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++static struct bfqio_cgroup bfqio_root_cgroup = { ++ .weight = BFQ_DEFAULT_GRP_WEIGHT, ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, ++}; ++ ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) ++{ ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), ++ struct bfqio_cgroup, css); ++} ++ ++/* ++ * Search the bfq_group for bfqd into the hash table (by now only a list) ++ * of bgrp. Must be called under rcu_read_lock(). 
++ */ ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_group *bfqg; ++ struct hlist_node *n; ++ void *key; ++ ++ hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { ++ key = rcu_dereference(bfqg->bfqd); ++ if (key == bfqd) ++ return bfqg; ++ } ++ ++ return NULL; ++} ++ ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqg->entity; ++ ++ entity->weight = entity->new_weight = bgrp->weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio = bgrp->ioprio; ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; ++ entity->ioprio_changed = 1; ++ entity->my_sched_data = &bfqg->sched_data; ++} ++ ++static inline void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(parent == NULL); ++ BUG_ON(bfqg == NULL); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++/** ++ * bfq_group_chain_alloc - allocate a chain of groups. ++ * @bfqd: queue descriptor. ++ * @cgroup: the leaf cgroup this chain starts from. ++ * ++ * Allocate a chain of groups starting from the one belonging to ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain ++ * to the root has already an allocated group on @bfqd. ++ */ ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; ++ ++ for (; cgroup != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) { ++ /* ++ * All the cgroups in the path from there to the ++ * root must have a bfq_group for bfqd, so we don't ++ * need any more allocations. ++ */ ++ break; ++ } ++ ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); ++ if (bfqg == NULL) ++ goto cleanup; ++ ++ bfq_group_init_entity(bgrp, bfqg); ++ bfqg->my_entity = &bfqg->entity; ++ ++ if (leaf == NULL) { ++ leaf = bfqg; ++ prev = leaf; ++ } else { ++ bfq_group_set_parent(prev, bfqg); ++ /* ++ * Build a list of allocated nodes using the bfqd ++ * filed, that is still unused and will be initialized ++ * only after the node will be connected. ++ */ ++ prev->bfqd = bfqg; ++ prev = bfqg; ++ } ++ } ++ ++ return leaf; ++ ++cleanup: ++ while (leaf != NULL) { ++ prev = leaf; ++ leaf = leaf->bfqd; ++ kfree(prev); ++ } ++ ++ return NULL; ++} ++ ++/** ++ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. ++ * @bfqd: the queue descriptor. ++ * @cgroup: the leaf cgroup to start from. ++ * @leaf: the leaf group (to be associated to @cgroup). ++ * ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the ++ * hierarchy that already as a group associated to @bfqd all the nodes ++ * in the path to the root cgroup have one too. ++ * ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy ++ * per device) while the bfqio_cgroup lock protects the list of groups ++ * belonging to the same cgroup. 
++ */ ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, ++ struct bfq_group *leaf) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *next, *prev = NULL; ++ unsigned long flags; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ next = leaf->bfqd; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ BUG_ON(bfqg != NULL); ++ ++ spin_lock_irqsave(&bgrp->lock, flags); ++ ++ rcu_assign_pointer(leaf->bfqd, bfqd); ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); ++ ++ spin_unlock_irqrestore(&bgrp->lock, flags); ++ ++ prev = leaf; ++ leaf = next; ++ } ++ ++ BUG_ON(cgroup == NULL && leaf != NULL); ++ if (cgroup != NULL && prev != NULL) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ bfq_group_set_parent(prev, bfqg); ++ } ++} ++ ++/** ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. ++ * @bfqd: queue descriptor. ++ * @cgroup: cgroup being searched for. ++ * ++ * Return a group associated to @bfqd in @cgroup, allocating one if ++ * necessary. When a group is returned all the cgroups in the path ++ * to the root have a group associated to @bfqd. ++ * ++ * If the allocation fails, return the root group: this breaks guarantees ++ * but is a safe fallbak. If this loss becames a problem it can be ++ * mitigated using the equivalent weight (given by the product of the ++ * weights of the groups in the path from @group to the root) in the ++ * root scheduler. ++ * ++ * We allocate all the missing nodes in the path from the leaf cgroup ++ * to the root and we connect the nodes only after all the allocations ++ * have been successful. ++ */ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct bfq_group *bfqg; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) ++ return bfqg; ++ ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup); ++ if (bfqg != NULL) ++ bfq_group_chain_link(bfqd, cgroup, bfqg); ++ else ++ bfqg = bfqd->root_group; ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @entity: @bfqq's entity. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_entity *entity, struct bfq_group *bfqg) ++{ ++ int busy, resume; ++ ++ busy = bfq_bfqq_busy(bfqq); ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ ++ BUG_ON(resume && !entity->on_st); ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); ++ ++ if (busy) { ++ BUG_ON(atomic_read(&bfqq->ref) < 2); ++ ++ if (!resume) ++ bfq_del_bfqq_busy(bfqd, bfqq, 0); ++ else ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ } else if (entity->on_st) ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ ++ /* ++ * Here we use a reference to bfqg. 
We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ ++ if (busy && resume) ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @cgroup: the cgroup to move to. ++ * ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_entity *entity; ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfq_find_alloc_group(bfqd, cgroup); ++ if (async_bfqq != NULL) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "bic_change_group: %p %d", ++ async_bfqq, atomic_read(&async_bfqq->ref)); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq != NULL) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bic: the bic being migrated. ++ * @cgroup: the destination cgroup. ++ * ++ * When the task owning @bic is moved to @cgroup, @bic is immediately ++ * moved into its new parent group. ++ */ ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_data *bfqd; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ if (bfqd != NULL) { ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++} ++ ++/** ++ * bfq_bic_update_cgroup - update the cgroup of @bic. ++ * @bic: the @bic to update. ++ * ++ * Make sure that @bic is enqueued in the cgroup of the current task. ++ * We need this in addition to moving bics during the cgroup attach ++ * phase because the task owning @bic could be at its first disk ++ * access or we may end up in the root cgroup as the result of a ++ * memory allocation failure and here we try to move to the right ++ * group. ++ * ++ * Must be called under the queue lock. It is safe to use the returned ++ * value even after the rcu_read_unlock() as the migration/destruction ++ * paths act under the queue lock too. IOW it is impossible to race with ++ * group migration/destruction and end up with an invalid group as: ++ * a) here cgroup has not yet been destroyed, nor its destroy callback ++ * has started execution, as current holds a reference to it, ++ * b) if it is destroyed after rcu_read_unlock() [after current is ++ * migrated to a different cgroup] its attach() callback will have ++ * taken care of remove all the references to the old cgroup data. 
++ */ ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg; ++ struct cgroup *cgroup; ++ ++ BUG_ON(bfqd == NULL); ++ ++ rcu_read_lock(); ++ cgroup = task_cgroup(current, bfqio_subsys_id); ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ rcu_read_unlock(); ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity != NULL; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, 0); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(bfqq == NULL); ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); ++ return; ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.active_entity != NULL) ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); ++ ++ return; ++} ++ ++/** ++ * bfq_destroy_group - destroy @bfqg. ++ * @bgrp: the bfqio_cgroup containing @bfqg. ++ * @bfqg: the group being destroyed. ++ * ++ * Destroy @bfqg, making sure that it is not referenced from its parent. ++ */ ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_service_tree *st; ++ struct bfq_entity *entity = bfqg->my_entity; ++ unsigned long uninitialized_var(flags); ++ int i; ++ ++ hlist_del(&bfqg->group_node); ++ ++ /* ++ * Empty all service_trees belonging to this group before deactivating ++ * the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ st = bfqg->sched_data.service_tree + i; ++ ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. Noone else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * under service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. 
++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_active != NULL); ++ BUG_ON(bfqg->sched_data.active_entity != NULL); ++ ++ /* ++ * We may race with device destruction, take extra care when ++ * dereferencing bfqg->bfqd. ++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ hlist_del(&bfqg->bfqd_node); ++ __bfq_deactivate_entity(entity, 0); ++ bfq_put_async_queues(bfqd, bfqg); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(entity->tree != NULL); ++ ++ /* ++ * No need to defer the kfree() to the end of the RCU grace ++ * period: we are called from the destroy() callback of our ++ * cgroup, so we can be sure that noone is a) still using ++ * this cgroup or b) doing lookups in it. ++ */ ++ kfree(bfqg); ++} ++ ++/** ++ * bfq_disconnect_groups - diconnect @bfqd from all its groups. ++ * @bfqd: the device descriptor being exited. ++ * ++ * When the device exits we just make sure that no lookup can return ++ * the now unused group structures. They will be deallocated on cgroup ++ * destruction. ++ */ ++static void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ struct hlist_node *pos, *n; ++ struct bfq_group *bfqg; ++ ++ bfq_log(bfqd, "disconnect_groups beginning") ; ++ hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { ++ hlist_del(&bfqg->bfqd_node); ++ ++ __bfq_deactivate_entity(bfqg->my_entity, 0); ++ ++ /* ++ * Don't remove from the group hash, just set an ++ * invalid key. No lookups can race with the ++ * assignment as bfqd is being destroyed; this ++ * implies also that new elements cannot be added ++ * to the list. ++ */ ++ rcu_assign_pointer(bfqg->bfqd, NULL); ++ ++ bfq_log(bfqd, "disconnect_groups: put async for group %p", ++ bfqg) ; ++ bfq_put_async_queues(bfqd, bfqg); ++ } ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; ++ struct bfq_group *bfqg = bfqd->root_group; ++ ++ bfq_put_async_queues(bfqd, bfqg); ++ ++ spin_lock_irq(&bgrp->lock); ++ hlist_del_rcu(&bfqg->group_node); ++ spin_unlock_irq(&bgrp->lock); ++ ++ /* ++ * No need to synchronize_rcu() here: since the device is gone ++ * there cannot be any read-side access to its root_group. 
++ */ ++ kfree(bfqg); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ bfqg->entity.parent = NULL; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ bgrp = &bfqio_root_cgroup; ++ spin_lock_irq(&bgrp->lock); ++ rcu_assign_pointer(bfqg->bfqd, bfqd); ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); ++ spin_unlock_irq(&bgrp->lock); ++ ++ return bfqg; ++} ++ ++#define SHOW_FUNCTION(__VAR) \ ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ ++ struct cftype *cftype) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ u64 ret; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ spin_lock_irq(&bgrp->lock); \ ++ ret = bgrp->__VAR; \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return ret; \ ++} ++ ++SHOW_FUNCTION(weight); ++SHOW_FUNCTION(ioprio); ++SHOW_FUNCTION(ioprio_class); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ ++ struct cftype *cftype, \ ++ u64 val) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ struct bfq_group *bfqg; \ ++ struct hlist_node *n; \ ++ \ ++ if (val < (__MIN) || val > (__MAX)) \ ++ return -EINVAL; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ \ ++ spin_lock_irq(&bgrp->lock); \ ++ bgrp->__VAR = (unsigned short)val; \ ++ hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ ++ bfqg->entity.new_##__VAR = (unsigned short)val; \ ++ smp_wmb(); \ ++ bfqg->entity.ioprio_changed = 1; \ ++ } \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return 0; \ ++} ++ ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); ++#undef STORE_FUNCTION ++ ++static struct cftype bfqio_files[] = { ++ { ++ .name = "weight", ++ .read_u64 = bfqio_cgroup_weight_read, ++ .write_u64 = bfqio_cgroup_weight_write, ++ }, ++ { ++ .name = "ioprio", ++ .read_u64 = bfqio_cgroup_ioprio_read, ++ .write_u64 = bfqio_cgroup_ioprio_write, ++ }, ++ { ++ .name = "ioprio_class", ++ .read_u64 = bfqio_cgroup_ioprio_class_read, ++ .write_u64 = bfqio_cgroup_ioprio_class_write, ++ }, ++ { }, /* terminate */ ++}; ++ ++static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ ++ if (cgroup->parent != NULL) { ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); ++ if (bgrp == NULL) ++ return ERR_PTR(-ENOMEM); ++ } else ++ bgrp = &bfqio_root_cgroup; ++ ++ spin_lock_init(&bgrp->lock); ++ INIT_HLIST_HEAD(&bgrp->group_data); ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; ++ ++ return &bgrp->css; ++} ++ ++/* ++ * We cannot support shared io contexts, as we have no means to support ++ * two tasks with the same ioc in two different groups without major rework ++ * of the main bic/bfqq data structures. By now we allow a task to change ++ * its cgroup only if it's the only owner of its ioc; the drawback of this ++ * behavior is that a group containing a task that forked using CLONE_IO ++ * will not be destroyed until the tasks sharing the ioc die. 
++ */ ++static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ int ret = 0; ++ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ /* task_lock() is needed to avoid races with exit_io_context() */ ++ task_lock(task); ++ ioc = task->io_context; ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) ++ /* ++ * ioc == NULL means that the task is either too young or ++ * exiting: if it has still no ioc the ioc can't be shared, ++ * if the task is exiting the attach will fail anyway, no ++ * matter what we return here. ++ */ ++ ret = -EINVAL; ++ task_unlock(task); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ struct io_cq *icq; ++ struct hlist_node *n; ++ ++ /* ++ * IMPORTANT NOTE: The move of more than one process at a time to a ++ * new group has not yet been tested. ++ */ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); ++ if (ioc) { ++ /* ++ * Handle cgroup change here. ++ */ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(icq, n, &ioc->icq_list, ioc_node) ++ if (!strncmp(icq->q->elevator->type->elevator_name, ++ "bfq", ELV_NAME_MAX)) ++ bfq_bic_change_cgroup(icq_to_bic(icq), ++ cgroup); ++ rcu_read_unlock(); ++ put_io_context(ioc); ++ } ++ } ++} ++ ++static void bfqio_destroy(struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct hlist_node *n, *tmp; ++ struct bfq_group *bfqg; ++ ++ /* ++ * Since we are destroying the cgroup, there are no more tasks ++ * referencing it, and all the RCU grace periods that may have ++ * referenced it are ended (as the destruction of the parent ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by ++ * anything else and we don't need any synchronization. 
++ */ ++ hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) ++ bfq_destroy_group(bgrp, bfqg); ++ ++ BUG_ON(!hlist_empty(&bgrp->group_data)); ++ ++ kfree(bgrp); ++} ++ ++struct cgroup_subsys bfqio_subsys = { ++ .name = "bfqio", ++ .css_alloc = bfqio_create, ++ .can_attach = bfqio_can_attach, ++ .attach = bfqio_attach, ++ .css_free = bfqio_destroy, ++ .subsys_id = bfqio_subsys_id, ++ .base_cftypes = bfqio_files, ++}; ++#else ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static inline struct bfq_group * ++bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ return bfqd->root_group; ++} ++ ++static inline void bfq_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++} ++ ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ kfree(bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 0000000..326e3ec +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if(ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +new file mode 100644 +index 0000000..c9d57b0 +--- /dev/null ++++ b/block/bfq-iosched.c +@@ -0,0 +1,3218 @@ ++/* ++ * BFQ, or Budget Fair Queueing, disk scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * ++ * BFQ is a proportional share disk scheduling algorithm based on the ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to tasks instead of time slices. 
++ * The disk is not granted to the active task for a given time slice, ++ * but until it has exahusted its assigned budget. This change from ++ * the time to the service domain allows BFQ to distribute the disk ++ * bandwidth among tasks as desired, without any distortion due to ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the ++ * throughput), and yet guarantee low latencies to interactive and ++ * soft real-time applications. ++ * ++ * BFQ has been introduced in [1], where the interested reader can ++ * find an accurate description of the algorithm, the bandwidth ++ * distribution and latency guarantees it provides, plus formal proofs ++ * of all the properties. With respect to the algorithm presented in ++ * the paper, this implementation adds several little heuristics, and ++ * a hierarchical extension, based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling ++ * with Deterministic Guarantees on Bandwidth Distribution,'', ++ * IEEE Transactions on Computer, May 2010. ++ * ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "bfq.h" ++#include "blk.h" ++ ++/* Max number of dispatches in one round of service. */ ++static const int bfq_quantum = 4; ++ ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = 16 * 1024; ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in jiffies. */ ++static int bfq_slice_idle = HZ / 125; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = 16 * 1024; ++static const int bfq_max_budget_async_rq = 4; ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multipled by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout_sync = HZ / 8; ++static int bfq_timeout_async = HZ / 25; ++ ++struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ms), we consider thinktime immediate. */ ++#define BFQ_MIN_TT 2 ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++ ++/* Min samples used for peak rate estimation (for autotuning). */ ++#define BFQ_PEAK_RATE_SAMPLES 32 ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * The duration of the weight raising for interactive applications is ++ * computed automatically (as default behaviour), using the following ++ * formula: duration = (R / r) * T, where r is the peak rate of the ++ * disk, and R and T are two reference parameters. In particular, R is ++ * the peak rate of a reference disk, and T is about the maximum time ++ * for starting popular large applications on that disk, under BFQ and ++ * while reading two files in parallel. Finally, BFQ uses two ++ * different pairs (R, T) depending on whether the disk is rotational ++ * or non-rotational. ++ */ ++#define T_rot (msecs_to_jiffies(5500)) ++#define T_nonrot (msecs_to_jiffies(2000)) ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ ++#define R_rot 17415 ++#define R_nonrot 34791 ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit ++ * set (in which case it could also be a direct WRITE). ++ */ ++static inline int bfq_bio_sync(struct bio *bio) ++{ ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (rq1 == NULL || rq1 == rq2) ++ return rq2; ++ if (rq2 == NULL) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. 
++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ else { ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (long long unsigned)sector, ++ bfqq != NULL ? 
bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfqd->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (__bfqq == NULL) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next = NULL, *prev = NULL; ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev != NULL) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext != NULL) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++static void bfq_del_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static inline unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ return blk_rq_sectors(rq) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * ++ bfq_async_charge_factor)); ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (next_rq == NULL) ++ return; ++ ++ if (bfqq == bfqd->active_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. 
++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->active_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_raising_max_time > 0) ++ return bfqd->bfq_raising_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ return dur; ++} ++ ++static inline void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ if (bic->saved_idle_window) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++ if (bic->raising_time_left) { ++ /* ++ * Start a weight raising period with the duration given by ++ * the raising_time_left snapshot. ++ */ ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bic->raising_time_left; ++ bfqq->last_rais_start_finish = jiffies; ++ } ++} ++ ++/* ++ * Must be called with the queue_lock held. ++ */ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++static void bfq_add_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned long old_raising_coeff = bfqq->raising_coeff; ++ int idle_for_long_time = bfqq->budget_timeout + ++ bfqd->bfq_raising_min_idle_time < jiffies; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(next_rq == NULL); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ ++ if (! bfqd->low_latency) ++ goto add_bfqq_busy; ++ ++ if (bfq_bfqq_just_split(bfqq)) ++ goto set_ioprio_changed; ++ ++ /* ++ * If the queue: ++ * - is not being boosted, ++ * - has been idle for enough time, ++ * - is not a sync queue or is linked to a bfq_io_cq (it is ++ * shared "for its nature" or it is not shared and its ++ * requests have not been redirected to a shared queue) ++ * start a weight-raising period. 
++ */ ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } else if (old_raising_coeff > 1) { ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else if (bfqq->raising_cur_max_time == ++ bfqd->bfq_raising_rt_max_time && ++ !soft_rt) { ++ bfqq->raising_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ } ++set_ioprio_changed: ++ if (old_raising_coeff != bfqq->raising_coeff) ++ entity->ioprio_changed = 1; ++add_bfqq_busy: ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ } else { ++ if(bfqd->low_latency && old_raising_coeff == 1 && ++ !rq_is_sync(rq) && ++ bfqq->last_rais_start_finish + ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); ++ ++ entity->ioprio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if(bfqd->low_latency && ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || ++ idle_for_long_time)) ++ bfqq->last_rais_start_finish = jiffies; ++} ++ ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) ++{ ++ elv_rb_del(&bfqq->sort_list, rq); ++ bfqq->queued[rq_is_sync(rq)]--; ++ bfqq->bfqd->queued--; ++ bfq_add_rq_rb(rq); ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ if (bfqq != NULL) { ++ sector_t sector = bio->bi_sector + bio_sectors(bio); ++ ++ return elv_rb_find(&bfqq->sort_list, sector); ++ } ++ ++ return NULL; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ bfqd->rq_in_driver++; ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", ++ (long long unsigned)bfqd->last_position); ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ WARN_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ list_del_init(&rq->queuelist); ++ bfq_del_rq_rb(rq); ++ ++ if (rq->cmd_flags & REQ_META) { ++ WARN_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static int bfq_merge(struct request_queue *q, struct request **req, ++ 
struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ int type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ ++ bfq_reposition_rq_rb(bfqq, req); ++ } ++} ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Reposition in fifo if next is older than rq. ++ */ ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { ++ list_move(&rq->queuelist, &next->queuelist); ++ rq_set_fifo_time(rq, rq_fifo_time(next)); ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++} ++ ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_sector; ++} ++ ++static inline sector_t bfq_dist_from(sector_t pos1, ++ sector_t pos2) ++{ ++ if (pos1 >= pos2) ++ return pos1 - pos2; ++ else ++ return pos2 - pos1; ++} ++ ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= ++ BFQQ_SEEK_THR; ++} ++ ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) ++{ ++ struct rb_root *root = &bfqd->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq != NULL) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by next_request ++ * position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (node == NULL) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++/* ++ * bfqd - obvious ++ * cur_bfqq - passed in so that we don't decide that the current queue ++ * is closely cooperating with itself ++ * sector - used as a reference point to search for a close queue ++ */ ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfq_class_idle(cur_bfqq)) ++ return NULL; ++ if (!bfq_bfqq_sync(cur_bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(cur_bfqq)) ++ return NULL; ++ ++ /* If device has only one backlogged bfq_queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ /* ++ * We should notice if some of the queues are cooperating, e.g. ++ * working closely on the same area of the disk. In that case, ++ * we can group them together and don't waste time idling. 
++ */ ++ bfqq = bfqq_close(bfqd, sector); ++ if (bfqq == NULL || bfqq == cur_bfqq) ++ return NULL; ++ ++ /* ++ * Do not merge queues from different bfq_groups. ++ */ ++ if (bfqq->entity.parent != cur_bfqq->entity.parent) ++ return NULL; ++ ++ /* ++ * It only makes sense to merge sync queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(bfqq)) ++ return NULL; ++ ++ /* ++ * Do not merge queues of different priority classes. ++ */ ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process owning ++ * one of the two queues are redirected to the other queue. The latter ++ * queue, in its turn, is set as shared if this is the first time that ++ * the requests of some process are redirected to it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have the ++ * io_cq of this process. So we can immediately configure this io_cq ++ * to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of ++ * new_bfqq is not available, because, if the active queue is shared, ++ * bfqd->active_bic may not point to the io_cq of the active queue. ++ * Redirecting the requests of the process owning bfqq to the currently ++ * active queue is in any case the best option, as we feed the active queue ++ * with new requests close to the last request served and, by doing so, ++ * hopefully increase the throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ atomic_add(process_refs, &new_bfqq->ref); ++ return new_bfqq; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently active queue or ++ * with a close queue among the scheduled queues. ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. 
++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *active_bfqq, *new_bfqq; ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct) ++ return NULL; ++ ++ active_bfqq = bfqd->active_queue; ++ ++ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic) ++ goto check_scheduled; ++ ++ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq)) ++ goto check_scheduled; ++ ++ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq)) ++ goto check_scheduled; ++ ++ if (active_bfqq->entity.parent != bfqq->entity.parent) ++ goto check_scheduled; ++ ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq)) ++ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq))) ++ return new_bfqq; /* Merge with the active queue */ ++ ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++check_scheduled: ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ if (new_bfqq) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static inline void ++bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic == NULL, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (bfqq->bic == NULL) ++ return; ++ if (bfqq->raising_coeff > 1) { ++ unsigned long wrais_duration = ++ jiffies - bfqq->last_rais_start_finish; ++ /* ++ * It may happen that a queue's weight raising period lasts ++ * longer than its raising_cur_max_time, as weight raising is ++ * handled only when a request is enqueued or dispatched (it ++ * does not use any timer). If the weight raising period is ++ * about to end, don't save it. ++ */ ++ if (bfqq->raising_cur_max_time <= wrais_duration) ++ bfqq->bic->raising_time_left = 0; ++ else ++ bfqq->bic->raising_time_left = ++ bfqq->raising_cur_max_time - wrais_duration; ++ /* ++ * The bfq_queue is becoming shared or the requests of the ++ * process owning the queue are being redirected to a shared ++ * queue. Stop the weight raising period of the queue, as in ++ * both cases it should not be owned by an interactive or soft ++ * real-time application. ++ */ ++ bfqq->raising_coeff = 1; ++ bfqq->entity.ioprio_changed = 1; ++ } else ++ bfqq->bic->raising_time_left = 0; ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); ++} ++ ++static inline void ++bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (long unsigned)new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). 
++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set ++ * new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to any ++ * bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) ++ return 0; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (bic == NULL) ++ return 0; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq != NULL && ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the shared queue, ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and ++ * rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static void __bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq != NULL) { ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_mark_bfqq_budget_new(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfqq->entity.budget); ++ } ++ ++ bfqd->active_queue = bfqq; ++} ++ ++/* ++ * Get and set a new active queue for service. ++ */ ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_active_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++/* ++ * Decides whether idling should be done for given device and ++ * given active queue. 
++ */ ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, ++ struct bfq_queue *active_bfqq) ++{ ++ if (active_bfqq == NULL) ++ return false; ++ /* ++ * If device is SSD it has no seek penalty, disable idling; but ++ * do so only if: ++ * - device does not support queuing, otherwise we still have ++ * a problem with sync vs async workloads; ++ * - the queue is not weight-raised, to preserve guarantees. ++ */ ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && ++ active_bfqq->raising_coeff == 1); ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct bfq_io_cq *bic; ++ unsigned long sl; ++ ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_queue_nonrot_noidle(bfqd, bfqq)) ++ return; ++ ++ /* Idling is disabled, either manually or by past process history. */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) ++ return; ++ ++ /* Tasks have exited, don't wait. */ ++ bic = bfqd->active_bic; ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && ++ bfqq->raising_coeff == 1) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); ++ else if (bfqq->raising_coeff > 1) ++ sl = sl * 3; ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); ++} ++ ++/* ++ * Set the maximum time for the active queue to consume its ++ * budget. This prevents seeky processes from lowering the disk ++ * throughput (always guaranteed with a time slice scheme as in CFQ). ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ unsigned int timeout_coeff; ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfq_clear_bfqq_budget_new(bfqq); ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * ++ timeout_coeff)); ++} ++ ++/* ++ * Move request from internal lists to the request queue dispatch list. ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ bfq_remove_request(rq); ++ bfqq->dispatched++; ++ elv_dispatch_sort(q, rq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight++; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. 
++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) ++{ ++ struct request *rq = NULL; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ if (list_empty(&bfqq->fifo)) ++ return NULL; ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (time_before(jiffies, rq_fifo_time(rq))) ++ return NULL; ++ ++ return rq; ++} ++ ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ __bfq_bfqd_reset_active(bfqd); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * overloading budget_timeout field to store when ++ * the queue remains with no backlog, used by ++ * the weight-raising mechanism ++ */ ++ bfqq->budget_timeout = jiffies ; ++ } else { ++ bfq_activate_bfqq(bfqd, bfqq); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ } ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget. See the body for detailed ++ * comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ unsigned long budget, min_budget; ++ ++ budget = bfqq->max_budget; ++ min_budget = bfq_min_budget(bfqd); ++ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no requets of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still oustanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. 
++ */ ++ if (bfqq->dispatched > 0) /* still oustanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because: 1) it ++ * gives the chance to boost the throughput if ++ * this is not a seeky process (which may have ++ * bumped into this timeout because of, e.g., ++ * ZBR), 2) together with charge_full_budget ++ * it helps give seeky processes higher ++ * timestamps, and hence be served less ++ * frequently. ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * Leave the budget unchanged. ++ */ ++ default: ++ return; ++ } ++ } else /* async queue */ ++ /* async queues get always the maximum possible budget ++ * (their ability to dispatch is limited by ++ * @bfqd->bfq_max_budget_async_rq). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && ++ bfqq->max_budget > bfqd->bfq_max_budget) ++ bfqq->max_budget = bfqd->bfq_max_budget; ++ ++ /* ++ * Make sure that we have enough budget for the next request. ++ * Since the finish time of the bfqq must be kept in sync with ++ * the budget, be sure to call __bfq_bfqq_expire() after the ++ * update. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq != NULL) ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ else ++ bfqq->entity.budget = bfqq->max_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++{ ++ unsigned long max_budget; ++ ++ /* ++ * The max_budget calculated when autotuning is equal to the ++ * amount of sectors transfered in timeout_sync at the ++ * estimated peak rate. ++ */ ++ max_budget = (unsigned long)(peak_rate * 1000 * ++ timeout >> BFQ_RATE_SHIFT); ++ ++ return max_budget; ++} ++ ++/* ++ * In addition to updating the peak rate, checks whether the process ++ * is "slow", and returns 1 if so. This slow flag is used, in addition ++ * to the budget timeout, to reduce the amount of service provided to ++ * seeky processes, and hence reduce their chances to lower the ++ * throughput. See the code for more details. ++ */ ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int compensate, enum bfqq_expiration reason) ++{ ++ u64 bw, usecs, expected, timeout; ++ ktime_t delta; ++ int update = 0; ++ ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (compensate) ++ delta = bfqd->last_idling_start; ++ else ++ delta = ktime_get(); ++ delta = ktime_sub(delta, bfqd->last_budget_start); ++ usecs = ktime_to_us(delta); ++ ++ /* Don't trust short/unrealistic values. */ ++ if (usecs < 100 || usecs >= LONG_MAX) ++ return 0; ++ ++ /* ++ * Calculate the bandwidth for the last slice. 
We use a 64 bit ++ * value to store the peak rate, in sectors per usec in fixed ++ * point math. We do so to have enough precision in the estimate ++ * and to avoid overflows. ++ */ ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; ++ do_div(bw, (unsigned long)usecs); ++ ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out spikes for ++ * the peak rate estimation. ++ */ ++ if (usecs > 20000) { ++ if (bw > bfqd->peak_rate || ++ (!BFQQ_SEEKY(bfqq) && ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { ++ bfq_log(bfqd, "measured bw =%llu", bw); ++ /* ++ * To smooth oscillations use a low-pass filter with ++ * alpha=7/8, i.e., ++ * new_rate = (7/8) * old_rate + (1/8) * bw ++ */ ++ do_div(bw, 8); ++ if (bw == 0) ++ return 0; ++ bfqd->peak_rate *= 7; ++ do_div(bfqd->peak_rate, 8); ++ bfqd->peak_rate += bw; ++ update = 1; ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ } ++ ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; ++ ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) ++ bfqd->peak_rate_samples++; ++ ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && ++ update && bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ bfq_log(bfqd, "new max_budget=%lu", ++ bfqd->bfq_max_budget); ++ } ++ } ++ ++ /* ++ * If the process has been served for a too short time ++ * interval to let its possible sequential accesses prevail on ++ * the initial seek time needed to move the disk head on the ++ * first sector it requested, then give the process a chance ++ * and for the moment return false. ++ */ ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) ++ return 0; ++ ++ /* ++ * A process is considered ``slow'' (i.e., seeky, so that we ++ * cannot treat it fairly in the service domain, as it would ++ * slow down too much the other processes) if, when a slice ++ * ends for whatever reason, it has received service at a ++ * rate that would not be high enough to complete the budget ++ * before the budget timeout expiration. ++ */ ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; ++ ++ /* ++ * Caveat: processes doing IO in the slower disk zones will ++ * tend to be slow(er) even if not seeky. And the estimated ++ * peak rate will actually be an average over the disk ++ * surface. Hence, to not be too harsh with unlucky processes, ++ * we keep a budget/3 margin of safety before declaring a ++ * process slow. ++ */ ++ return expected > (4 * bfqq->entity.budget) / 3; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * ++ * If the process associated to the queue is slow (i.e., seeky), or in ++ * case of budget timeout, or, finally, if it is async, we ++ * artificially charge it an entire budget (independently of the ++ * actual service it received). As a consequence, the queue will get ++ * higher timestamps than the correct ones upon reactivation, and ++ * hence it will be rescheduled as if it had received more service ++ * than what it actually received. In the end, this class of processes ++ * will receive less service in proportion to how slowly they consume ++ * their budgets (and hence how seriously they tend to lower the ++ * throughput). 
++ * ++ * In contrast, when a queue expires because it has been idling for ++ * too much or because it exhausted its budget, we do not touch the ++ * amount of service it has received. Hence when the queue will be ++ * reactivated and its timestamps updated, the latter will be in sync ++ * with the actual service received by the queue until expiration. ++ * ++ * Charging a full budget to the first type of queues and the exact ++ * service to the others has the effect of using the WF2Q+ policy to ++ * schedule the former on a timeslice basis, without violating the ++ * service domain guarantees of the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int compensate, ++ enum bfqq_expiration reason) ++{ ++ int slow; ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ /* Update disk peak rate for autotuning and check whether the ++ * process is slow (see bfq_update_peak_rate). ++ */ ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ ++ /* ++ * As above explained, 'punish' slow (i.e., seeky), timed-out ++ * and async queues, to favor sequential sync workloads. ++ * ++ * Processes doing IO in the slower disk zones will tend to be ++ * slow(er) even if not seeky. Hence, since the estimated peak ++ * rate is actually an average over the disk surface, these ++ * processes may timeout just for bad luck. To avoid punishing ++ * them we do not charge a full budget to a process that ++ * succeeded in consuming at least 2/3 of its budget. ++ */ ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) ++ bfq_bfqq_charge_full_budget(bfqq); ++ ++ if (bfqd->low_latency && bfqq->raising_coeff == 1) ++ bfqq->last_rais_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { ++ if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) ++ bfqq->soft_rt_next_start = ++ jiffies + ++ HZ * bfqq->entity.service / ++ bfqd->bfq_raising_max_softrt_rate; ++ else ++ bfqq->soft_rt_next_start = -1; /* infinity */ ++ } ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ ++ /* Increase, decrease or leave budget unchanged according to reason */ ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ __bfq_bfqq_expire(bfqd, bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (time_before(jiffies, bfqq->budget_timeout)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * If we expire a queue that is waiting for the arrival of a new ++ * request, we may prevent the fictitious timestamp backshifting that ++ * allows the guarantees of the queue to be preserved (see [1] for ++ * this tricky aspect). Hence we return true only if this condition ++ * does not hold, or if the queue is slow enough to deserve only to be ++ * kicked off for preserving a high throughput. 
++*/ ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wr %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current active queue, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq == NULL) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ goto expire; ++ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq != NULL) { ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may not ++ * disable disk idling even when a new request arrives ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged the ++ * device, causing the dispatch to be invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. If there is no cooperator, and the active ++ * queue still has requests in flight or is idling for a new request, ++ * then keep it. ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && ++ !bfq_queue_nonrot_noidle(bfqd, bfqq))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); ++new_queue: ++ bfqq = bfq_set_active_queue(bfqd); ++ bfq_log(bfqd, "select_queue: new queue %d returned", ++ bfqq != NULL ? 
bfqq->pid : 0); ++keep_queue: ++ return bfqq; ++} ++ ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, " ++ "old raising coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time), ++ bfqq->raising_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight != ++ entity->orig_weight * bfqq->raising_coeff); ++ if(entity->ioprio_changed) ++ bfq_log_bfqq(bfqd, bfqq, ++ "WARN: pending prio change"); ++ /* ++ * If too much time has elapsed from the beginning ++ * of this weight-raising period and process is not soft ++ * real-time, stop it ++ */ ++ if (jiffies - bfqq->last_rais_start_finish > ++ bfqq->raising_cur_max_time) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ ++ bfqq->last_rais_start_finish = jiffies; ++ if (soft_rt) ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ bfqq->raising_coeff = 1; ++ entity->ioprio_changed = 1; ++ } ++ } ++ } ++ /* Update weight both if it must be raised and if it must be lowered */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1)) ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); ++} ++ ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Follow expired path, else get first next available. */ ++ rq = bfq_check_fifo(bfqq); ++ if (rq == NULL) ++ rq = bfqq->next_rq; ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * This may happen if the next rq is chosen ++ * in fifo order instead of sector order. ++ * The budget is properly dimensioned ++ * to be always sufficient to serve the next request ++ * only if it is chosen in sector order. The reason is ++ * that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough ++ * to serve even the possible next rq in fifo order. ++ * In fact, requests are seldom served in fifo order. ++ * ++ * Expire the queue for budget exhaustion, and ++ * make sure that the next act_budget is enough ++ * to serve the next request, even if it comes ++ * from the fifo expired path. ++ */ ++ bfqq->next_rq = rq; ++ /* ++ * Since this dispatch is failed, make sure that ++ * a new one will be performed ++ */ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ goto expire; ++ } ++ ++ /* Finally, insert request into driver dispatch list. 
*/ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ update_raising_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " ++ "budg left %lu", ++ blk_rq_sectors(rq), ++ (long long unsigned)blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (bfqd->active_bic == NULL) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->active_bic = RQ_BIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && ++ dispatched >= bfqd->bfq_max_budget_async_rq) || ++ bfq_class_idle(bfqq))) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq != NULL) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. Used for barriers and when switching ++ * io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq != NULL) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ int max_dispatch; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ if((bfqq = bfq_select_queue(bfqd)) == NULL) ++ return 0; ++ ++ max_dispatch = bfqd->bfq_quantum; ++ if (bfq_class_idle(bfqq)) ++ max_dispatch = 1; ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ max_dispatch = bfqd->bfq_max_budget_async_rq; ++ ++ if (bfqq->dispatched >= max_dispatch) { ++ if (bfqd->busy_queues > 1) ++ return 0; ++ if (bfqq->dispatched >= 4 * max_dispatch) ++ return 0; ++ } ++ ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) ++ return 0; ++ ++ bfq_clear_bfqq_wait_request(bfqq); ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ if (! bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" ++ "(max_disp %d)", bfqq->pid, max_dispatch); ++ ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. 
++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ if (!atomic_dec_and_test(&bfqq->ref)) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree != NULL); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqd->active_queue == bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) { ++ WARN(1, "bfqq->new_bfqq loop detected.\n"); ++ break; ++ } ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->active_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ bic->ttime.last_end_request = jiffies; ++ bic->raising_time_left = 0; ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic->bfqq[BLK_RW_ASYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); ++ bic->bfqq[BLK_RW_ASYNC] = NULL; ++ } ++ ++ if (bic->bfqq[BLK_RW_SYNC]) { ++ /* ++ * If the bic is using a shared queue, put the reference ++ * taken on the io_context when the bic started using a ++ * shared bfq_queue. ++ */ ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) ++ put_io_context(icq->ioc); ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); ++ bic->bfqq[BLK_RW_SYNC] = NULL; ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ if (!bfq_bfqq_prio_changed(bfqq)) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. 
++ */ ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->entity.new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ bfqq->entity.ioprio_changed = 1; ++ ++ /* ++ * Keep track of original prio settings in case we have to temporarily ++ * elevate the priority of this queue. ++ */ ++ bfqq->org_ioprio = bfqq->entity.new_ioprio; ++ bfq_clear_bfqq_prio_changed(bfqq); ++} ++ ++static void bfq_changed_ioprio(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_group *bfqg; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ /* ++ * This condition may trigger on a newly created bic, be sure to drop the ++ * lock before returning. ++ */ ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) ++ goto out; ++ ++ bfqq = bic->bfqq[BLK_RW_ASYNC]; ++ if (bfqq != NULL) { ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, ++ sched_data); ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, ++ GFP_ATOMIC); ++ if (new_bfqq != NULL) { ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, ++ "changed_ioprio: bfqq %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++ } ++ ++ bfqq = bic->bfqq[BLK_RW_SYNC]; ++ if (bfqq != NULL) ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ bic->ioprio = ioprio; ++ ++out: ++ bfq_put_bfqd_unlock(bfqd, &flags); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ ++ atomic_set(&bfqq->ref, 0); ++ bfqq->bfqd = bfqd; ++ ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ } ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->raising_coeff = 1; ++ bfqq->last_rais_start_finish = 0; ++ bfqq->soft_rt_next_start = -1; ++} ++ ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int is_sync, ++ struct bfq_io_cq *bic, ++ gfp_t gfp_mask) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ ++retry: ++ /* bic always exists here */ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ ++ /* ++ * Always try a new alloc if we fall back to the OOM bfqq ++ * originally, since it should just be a temporary situation. 
++ */ ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = NULL; ++ if (new_bfqq != NULL) { ++ bfqq = new_bfqq; ++ new_bfqq = NULL; ++ } else if (gfp_mask & __GFP_WAIT) { ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ new_bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ spin_lock_irq(bfqd->queue->queue_lock); ++ if (new_bfqq != NULL) ++ goto retry; ++ } else { ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ } ++ ++ if (bfqq != NULL) { ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ } ++ ++ bfq_init_prio_data(bfqq, bic); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ } ++ ++ if (new_bfqq != NULL) ++ kmem_cache_free(bfq_pool, new_bfqq); ++ ++ return bfqq; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ } ++ ++ if (bfqq == NULL) ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will prune it. ++ */ ++ if (!is_sync && *async_bfqq == NULL) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ *async_bfqq = bfqq; ++ } ++ ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); ++ ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples; ++} ++ ++static void bfq_update_io_seektime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist; ++ u64 total; ++ ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ ++ /* ++ * Don't allow the seek distance to get too large from the ++ * odd fragment, pagein, etc. 
++ */ ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ ++ sdist = 0; ++ else if (bfqq->seek_samples <= 60) /* second & third seek */ ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); ++ else ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); ++ ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; ++ total = bfqq->seek_total + (bfqq->seek_samples/2); ++ do_div(total, bfqq->seek_samples); ++ if (bfq_bfqq_coop(bfqq)) { ++ /* ++ * If the mean seektime increases for a (non-seeky) shared ++ * queue, some cooperator is likely to be idling too much. ++ * On the contrary, if it decreases, some cooperator has ++ * probably waked up. ++ * ++ */ ++ if ((sector_t)total < bfqq->seek_mean) ++ bfq_mark_bfqq_some_coop_idle(bfqq) ; ++ else if ((sector_t)total > bfqq->seek_mean) ++ bfq_clear_bfqq_some_coop_idle(bfqq) ; ++ } ++ bfqq->seek_mean = (sector_t)total; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, ++ (u64)bfqq->seek_mean); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (bfq_bfqq_just_split(bfqq)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->raising_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->raising_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, bic); ++ bfq_clear_bfqq_just_split(bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), ++ (long long unsigned)bfqq->seek_mean); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->active_queue) { ++ /* ++ * If there is just this request queued and the request ++ * is small, just exit. ++ * In this way, if the disk is being idled to wait for a new ++ * request from the active queue, we avoid unplugging the ++ * device now. ++ * ++ * By doing so, we spare the disk to be committed ++ * to serve just a small request. 
On the contrary, we wait for ++ * the block layer to decide when to unplug the device: ++ * hopefully, new requests will be merged to this ++ * one quickly, then the device will be unplugged ++ * and larger requests will be dispatched. ++ */ ++ if (bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32) { ++ return; ++ } ++ if (bfq_bfqq_wait_request(bfqq)) { ++ /* ++ * If we are waiting for a request for this queue, let ++ * it rip immediately and flag that we must not expire ++ * this queue just now. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ /* ++ * Here we can safely expire the queue, in ++ * case of budget timeout, without wasting ++ * guarantees ++ */ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ __blk_run_queue(bfqd->queue); ++ } ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt() && ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ atomic_inc(&new_bfqq->ref); ++ bfq_put_queue(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ ++ bfq_init_prio_data(bfqq, RQ_BIC(rq)); ++ ++ bfq_add_rq_rb(rq); ++ ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", ++ blk_rq_sectors(rq), sync); ++ ++ bfq_update_hw_tag(bfqd); ++ ++ WARN_ON(!bfqd->rq_in_driver); ++ WARN_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight--; ++ ++ if (sync) ++ RQ_BIC(rq)->ttime.last_end_request = jiffies; ++ ++ /* ++ * If this is the active queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. 
++ */ ++ if (bfqd->active_queue == bfqq) { ++ if (bfq_bfqq_budget_new(bfqq)) ++ bfq_set_budget_timeout(bfqd); ++ ++ /* Idling is disabled also for cooperation issues: ++ * 1) there is a close cooperator for the queue, or ++ * 2) the queue is shared and some cooperator is likely ++ * to be idle (in this case, by not arming the idle timer, ++ * we try to slow down the queue, to prevent the zones ++ * of the disk accessed by the active cooperators to become ++ * too distant from the zone that will be accessed by the ++ * currently idle cooperators) ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (sync && ++ (bfqd->rq_in_driver == 0 || ++ bfqq->raising_coeff > 1) ++ && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) ++ && (!bfq_bfqq_coop(bfqq) || ++ !bfq_bfqq_some_coop_idle(bfqq))) ++ bfq_arm_slice_timer(bfqd); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++static inline int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, int rw) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be queued. ++ * So just lookup a possibly existing queue, or return 'may queue' ++ * if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); ++ if (bfqq != NULL) { ++ bfq_init_prio_data(bfqq, bic); ++ ++ return __bfq_may_queue(bfqq); ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq != NULL) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to said bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_some_coop_idle(bfqq); ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ unsigned long flags; ++ bool split = false; ++ ++ might_sleep_if(gfp_mask & __GFP_WAIT); ++ ++ bfq_changed_ioprio(bic); ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (bic == NULL) ++ goto queue_fail; ++ ++ bfqg = bfq_bic_update_cgroup(bic); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); ++ bic_set_bfqq(bic, bfqq, is_sync); ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ bfq_mark_bfqq_just_split(bfqq); ++ /* ++ * If the queue has just been split from a shared queue, ++ * restore the idle window and the possible weight ++ * raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bic); ++ } ++ } ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the active_queue ++ * is idling inside its time slice. ++ */ ++static void bfq_idle_slice_timer(unsigned long data) ++{ ++ struct bfq_data *bfqd = (struct bfq_data *)data; ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->active_queue; ++ /* ++ * Theoretical race here: active_queue can be NULL or different ++ * from the queue that was idling if the timer handler spins on ++ * the queue_lock and a new request arrives for the current ++ * queue and there is a full dispatch cycle that changes the ++ * active_queue. This can hardly happen, but in the worst case ++ * we just expire a queue too early. 
++ */ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the first ++ * request of the active queue arrives during ++ * disk idling ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ del_timer_sync(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq != NULL) { ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure untill all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ ++ bfq_disconnect_groups(bfqd); ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ synchronize_rcu(); ++ ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ bfq_free_root_group(bfqd); ++ kfree(bfqd); ++} ++ ++static int bfq_init_queue(struct request_queue *q) ++{ ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); ++ if (bfqd == NULL) ++ return -ENOMEM; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. 
++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); ++ atomic_inc(&bfqd->oom_bfqq.ref); ++ ++ bfqd->queue = q; ++ q->elevator->elevator_data = bfqd; ++ ++ bfqg = bfq_alloc_root_group(bfqd, q->node); ++ if (bfqg == NULL) { ++ kfree(bfqd); ++ return -ENOMEM; ++ } ++ ++ bfqd->root_group = bfqg; ++ ++ init_timer(&bfqd->idle_slice_timer); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; ++ ++ bfqd->rq_pos_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_quantum = bfq_quantum; ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_class_idle_last_service = 0; ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ ++ bfqd->low_latency = true; ++ ++ bfqd->bfq_raising_coeff = 20; ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_raising_max_time = 0; ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_raising_max_softrt_rate = 7000; ++ ++ /* Initially estimate the device's peak rate as the reference rate */ ++ if (blk_queue_nonrot(bfqd->queue)) { ++ bfqd->RT_prod = R_nonrot * T_nonrot; ++ bfqd->peak_rate = R_nonrot; ++ } else { ++ bfqd->RT_prod = R_rot * T_rot; ++ bfqd->peak_rate = R_rot; ++ } ++ ++ return 0; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ if (bfq_pool != NULL) ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (bfq_pool == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%d\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) ++{ ++ unsigned long new_val; ++ int ret = strict_strtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? 
++ bfqd->bfq_raising_max_time : ++ bfq_wrais_duration(bfqd)); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d," ++ " dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1], ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned int __data = __VAR; \ ++ if (__CONV) \ ++ __data = jiffies_to_msecs(__data); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, ++ 1); ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, ++ bfqd->bfq_raising_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, ++ bfqd->bfq_raising_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long __data; \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_back_seek_max_store, 
&bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, ++ 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_idle_time_store, ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) ++{ ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ else ++ return bfq_default_max_budget; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(quantum), ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(max_budget_async_rq), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(timeout_async), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(raising_coeff), ++ BFQ_ATTR(raising_max_time), ++ BFQ_ATTR(raising_rt_max_time), ++ BFQ_ATTR(raising_min_idle_time), ++ BFQ_ATTR(raising_min_inter_arr_async), ++ BFQ_ATTR(raising_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ 
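Editor's aside (illustrative only, not part of the patch): the show/store helpers and the bfq_attrs[] table above are what expose BFQ's tunables through sysfs. Once a disk is switched to bfq, these attributes are expected to appear under /sys/block/<dev>/queue/iosched/. The small userspace sketch below reads and then enables low_latency as an example of how such a tunable is driven; the device name "sda" is only a placeholder.

/* Toy userspace client for one BFQ sysfs tunable (assumes bfq is active on sda). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/block/sda/queue/iosched/low_latency";
	char buf[16];
	int fd = open(attr, O_RDWR);

	if (fd < 0) {
		perror("open");		/* bfq not in use on sda, or no permission */
		return 1;
	}

	ssize_t n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("low_latency is currently %s", buf);	/* sysfs value ends with \n */
	}

	/* Equivalent of "echo 1 > .../low_latency". */
	lseek(fd, 0, SEEK_SET);
	if (write(fd, "1\n", 2) != 2)
		perror("write");

	close(fd);
	return 0;
}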
.ops = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++ .elevator_allow_merge_fn = bfq_allow_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++static int __init bfq_init(void) ++{ ++ /* ++ * Can be 0 on HZ < 1000 setups. ++ */ ++ if (bfq_slice_idle == 0) ++ bfq_slice_idle = 1; ++ ++ if (bfq_timeout_async == 0) ++ bfq_timeout_async = 1; ++ ++ if (bfq_slab_setup()) ++ return -ENOMEM; ++ ++ elv_register(&iosched_bfq); ++ ++ return 0; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +new file mode 100644 +index 0000000..a0edaa2 +--- /dev/null ++++ b/block/bfq-sched.c +@@ -0,0 +1,1044 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = entity->parent) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd); ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ ++ BUG_ON(next_active == NULL); ++ ++ group_sd = next_active->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an active entity. ++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity != NULL) ++ bfqg_entity->budget = next_active->budget; ++} ++ ++static int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ struct bfq_entity *next_active; ++ ++ if (sd->active_entity != NULL) ++ /* will update/requeue at the end of service */ ++ return 0; ++ ++ /* ++ * NOTE: this can be improved in many ways, such as returning ++ * 1 (and thus propagating upwards the update) only when the ++ * budget changes, or caching the bfqq that will be scheduled ++ * next from this subtree. By now we worry more about ++ * correctness than about performance... 
++ */ ++ next_active = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_active = next_active; ++ ++ if (next_active != NULL) ++ bfq_update_budget(next_active); ++ ++ return 1; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(sd->next_active != entity); ++} ++#else ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity != NULL; entity = parent) ++ ++static inline int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ return 0; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++} ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++} ++#endif ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static inline int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(entity == NULL); ++ ++ if (entity->my_sched_data == NULL) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static inline u64 bfq_delta(unsigned long service, ++ unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static inline void bfq_calc_finish(struct bfq_entity *entity, ++ unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ entity->start, entity->finish, ++ bfq_delta(service, entity->weight)); ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node != NULL) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. 
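Editor's aside (illustrative only, not part of the patch): bfq_gt() and bfq_delta() defined above are easy to sanity-check in isolation. The standalone sketch below re-implements them in plain userspace C and shows that the signed-difference trick keeps timestamp comparisons correct across a 64-bit wraparound, and that doubling the weight halves the virtual-time delta charged for the same service; the constant 22 mirrors WFQ_SERVICE_SHIFT above.

/* Standalone sanity check for the timestamp helpers above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT	22	/* same shift as in the patch */

/* Wrap-safe "a > b" on 64-bit virtual timestamps. */
static int bfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

/* Map an amount of service to a virtual-time delta for a given weight. */
static uint64_t bfq_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Comparison survives wraparound: b is just below 2^64, a just after it. */
	uint64_t b = UINT64_MAX - 1, a = b + 4;	/* a == 2 after wrapping */
	assert(bfq_gt(a, b));
	assert(!bfq_gt(b, a));

	/* Twice the weight -> half the virtual-time delta for the same service. */
	assert(bfq_delta(4096, 200) == bfq_delta(4096, 100) / 2);
	printf("delta(4096, 100) = %llu\n",
	       (unsigned long long)bfq_delta(4096, 100));
	return 0;
}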
++ */ ++static inline void bfq_extract(struct rb_root *root, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree != NULL); ++ ++ while (*node != NULL) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static inline void bfq_update_min(struct bfq_entity *entity, ++ struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node != NULL) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static inline void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. 
++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (parent == NULL) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right != NULL) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left != NULL) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left != NULL) ++ node = node->rb_left; ++ else if (node->rb_right != NULL) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return IOPRIO_BE_NR - ioprio; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as mush as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; ++} ++ ++static inline void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (node->rb_right == NULL && node->rb_left == NULL) ++ deepest = rb_parent(node); ++ else if (node->rb_right == NULL) ++ deepest = node->rb_left; ++ else if (node->rb_left == NULL) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right != NULL) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. 
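Editor's aside (illustrative only, not part of the patch): bfq_ioprio_to_weight() and bfq_weight_to_ioprio() above are a simple affine mapping between CFQ-style ioprios and BFQ weights. The sketch below mirrors them in userspace C, assuming IOPRIO_BE_NR == 8 as in the mainline headers, and exercises the clamping to the escape value 0 for weights that have no ioprio equivalent.

/* Standalone copy of the ioprio <-> weight mapping used above. */
#include <assert.h>
#include <stdio.h>

#define IOPRIO_BE_NR	8	/* number of best-effort ioprio levels */

/* Lower ioprio number == higher priority == larger weight. */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* Weights >= IOPRIO_BE_NR cannot be expressed as an ioprio: report 0 instead. */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	assert(bfq_ioprio_to_weight(0) == 8);	/* highest BE priority */
	assert(bfq_ioprio_to_weight(7) == 1);	/* lowest BE priority */
	assert(bfq_weight_to_ioprio(3) == 5);	/* round-trips for small weights */
	assert(bfq_weight_to_ioprio(20) == 0);	/* cgroup-style weight: escape value */
	printf("ioprio 4 -> weight %u\n", bfq_ioprio_to_weight(4));
	return 0;
}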
++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node != NULL) ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - remove an entity from the wfq trees. ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * ++ * Update the device status and forget everything about @entity, putting ++ * the device reference to it, if it is a queue. Entities belonging to ++ * groups are not refcounted. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = 0; ++ st->wsum -= entity->weight; ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. 
++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->ioprio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } else if (entity->new_ioprio != entity->ioprio) { ++ entity->ioprio = entity->new_ioprio; ++ entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ } else ++ entity->new_weight = entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->ioprio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ entity->weight = entity->orig_weight * ++ (bfqq != NULL ? bfqq->raising_coeff : 1); ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ BUG_ON(entity->service > entity->budget); ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); ++} ++ ++/** ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * @bfqq: the queue that needs a service update. ++ * ++ * When it's not possible to be fair in the service domain, because ++ * a queue is not consuming its budget fast enough (the meaning of ++ * fast depends on the timeout parameter), we charge it a full ++ * budget. In this way we should obtain a sort of time-domain ++ * fairness among all the seeky/slow queues. ++ */ ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); ++ ++ bfq_bfqq_served(bfqq, entity->budget - entity->service); ++} ++ ++/** ++ * __bfq_activate_entity - activate an entity. ++ * @entity: the entity being activated. ++ * ++ * Called whenever an entity is activated, i.e., it is not active and one ++ * of its children receives a new request, or has to be reactivated due to ++ * budget exhaustion. It uses the current budget of the entity (and the ++ * service received if @entity is active) of the queue to calculate its ++ * timestamps. 
++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (entity == sd->active_entity) { ++ BUG_ON(entity->tree != NULL); ++ /* ++ * If we are requeueing the current entity we have ++ * to take care of not charging to it service it has ++ * not received. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) { ++ /* ++ * Requeueing an entity due to a change of some ++ * next_active entity below it. We reuse the old ++ * start time. ++ */ ++ bfq_active_extract(st, entity); ++ } else if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ entity->start = bfq_gt(st->vtime, entity->finish) ? ++ st->vtime : entity->finish; ++ } else { ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = st->vtime; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } ++ ++ st = __bfq_entity_update_weight_prio(st, entity); ++ bfq_calc_finish(entity, entity->budget); ++ bfq_active_insert(st, entity); ++} ++ ++/** ++ * bfq_activate_entity - activate an entity and its ancestors if necessary. ++ * @entity: the entity to activate. ++ * ++ * Activate @entity and all the entities on the path from it to the root. ++ */ ++static void bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ /* ++ * No need to propagate the activation to the ++ * upper entities, as they will be updated when ++ * the active entity is rescheduled. ++ */ ++ break; ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @requeue: if false, the entity will not be put into the idle tree. ++ * ++ * Deactivate an entity, independently from its previous state. If the ++ * entity was not on a service tree just return, otherwise if it is on ++ * any scheduler tree, extract it from that tree, and if necessary ++ * and if the caller did not specify @requeue, put it on the idle tree. ++ * ++ * Return %1 if the caller should update the entity hierarchy, i.e., ++ * if the entity was under service or if it was the next_active for ++ * its sched_data; return %0 otherwise. 
++ */ ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ int was_active = entity == sd->active_entity; ++ int ret = 0; ++ ++ if (!entity->on_st) ++ return 0; ++ ++ BUG_ON(was_active && entity->tree != NULL); ++ ++ if (was_active) { ++ bfq_calc_finish(entity, entity->service); ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree != NULL) ++ BUG(); ++ ++ if (was_active || sd->next_active == entity) ++ ret = bfq_update_next_active(sd); ++ ++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity); ++ else ++ bfq_idle_insert(st, entity); ++ ++ BUG_ON(sd->active_entity == entity); ++ BUG_ON(sd->next_active == entity); ++ ++ return ret; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity. ++ * @entity: the entity to deactivate. ++ * @requeue: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ if (!__bfq_deactivate_entity(entity, requeue)) ++ /* ++ * The parent entity is still backlogged, and ++ * we don't need to update it as it is still ++ * under service. ++ */ ++ break; ++ ++ if (sd->next_active != NULL) ++ /* ++ * The parent entity is still backlogged and ++ * the budgets on the path towards the root ++ * need to be updated. ++ */ ++ goto update; ++ ++ /* ++ * If we reach there the parent is no more backlogged and ++ * we want to propagate the dequeue upwards. ++ */ ++ requeue = 1; ++ } ++ ++ return; ++ ++update: ++ entity = parent; ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ break; ++ } ++} ++ ++/** ++ * bfq_update_vtime - update vtime if necessary. ++ * @st: the service tree to act upon. ++ * ++ * If necessary update the service tree vtime to have at least one ++ * eligible entity, skipping to its start time. Assumes that the ++ * active tree of the device is not empty. ++ * ++ * NOTE: this hierarchical implementation updates vtimes quite often, ++ * we may end up with reactivated tasks getting timestamps after a ++ * vtime skip done because we needed a ->first_active entity on some ++ * intermediate node. ++ */ ++static void bfq_update_vtime(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry; ++ struct rb_node *node = st->active.rb_node; ++ ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entry->min_start, st->vtime)) { ++ st->vtime = entry->min_start; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active - find the eligible entity with the smallest finish time ++ * @st: the service tree to select from. ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path ++ * on the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. 
++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node != NULL) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, st->vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); ++ ++ if (node->rb_left != NULL) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, st->vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first != NULL) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * Update the virtual time in @st and return the first eligible entity ++ * it contains. ++ */ ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, ++ bool force) ++{ ++ struct bfq_entity *entity, *new_next_active = NULL; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ bfq_update_vtime(st); ++ entity = bfq_first_active_entity(st); ++ BUG_ON(bfq_gt(entity->start, st->vtime)); ++ ++ /* ++ * If the chosen entity does not match with the sched_data's ++ * next_active and we are forcedly serving the IDLE priority ++ * class tree, bubble up budget update. ++ */ ++ if (unlikely(force && entity != entity->sched_data->next_active)) { ++ new_next_active = entity; ++ for_each_entity(new_next_active) ++ bfq_update_budget(new_next_active); ++ } ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @extract: if true the returned entity will be also extracted from @sd. ++ * ++ * NOTE: since we cache the next_active entity at each level of the ++ * hierarchy, the complexity of the lookup can be decreased with ++ * absolutely no effort just returning the cached next_active value; ++ * we prefer to do full lookups to test the consistency of * the data ++ * structures. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_entity *entity; ++ int i=0; ++ ++ BUG_ON(sd->active_entity != NULL); ++ ++ if (bfqd != NULL && ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); ++ if (entity != NULL) { ++ i = BFQ_IOPRIO_CLASSES - 1; ++ bfqd->bfq_class_idle_last_service = jiffies; ++ sd->next_active = entity; ++ } ++ } ++ for (; i < BFQ_IOPRIO_CLASSES; i++) { ++ entity = __bfq_lookup_next_entity(st + i, false); ++ if (entity != NULL) { ++ if (extract) { ++ bfq_check_next_active(sd, entity); ++ bfq_active_extract(st + i, entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ } ++ break; ++ } ++ } ++ ++ return entity; ++} ++ ++/* ++ * Get next queue for service. 
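Editor's aside (illustrative only, not part of the patch): the lookup code above walks the augmented tree to find, among the entities whose start time is eligible (start <= vtime), the one with the smallest finish time, in O(log N). The toy below applies the same selection rule with a plain linear scan and without wrap-safe comparisons, just to make the rule itself concrete.

/* Simplified, O(N) version of the eligible-with-minimum-finish selection. */
#include <stdint.h>
#include <stdio.h>

struct toy_entity {
	uint64_t start;		/* virtual start time (S_i) */
	uint64_t finish;	/* virtual finish time (F_i) */
};

/* Return the index of the eligible entity with minimum finish, or -1. */
static int pick_next(const struct toy_entity *e, int n, uint64_t vtime)
{
	int best = -1;

	for (int i = 0; i < n; i++) {
		if (e[i].start > vtime)		/* not eligible yet */
			continue;
		if (best < 0 || e[i].finish < e[best].finish)
			best = i;
	}
	return best;
}

int main(void)
{
	struct toy_entity e[] = {
		{ .start = 0,  .finish = 40 },
		{ .start = 0,  .finish = 25 },	/* eligible, smallest finish */
		{ .start = 90, .finish = 95 },	/* not eligible at vtime 50 */
	};

	printf("next entity: %d\n", pick_next(e, 3, 50));	/* prints 1 */
	return 0;
}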
++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd != NULL; sd = entity->my_sched_data) { ++ entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ BUG_ON(entity == NULL); ++ entity->service = 0; ++ } ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(bfqq == NULL); ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) ++{ ++ if (bfqd->active_bic != NULL) { ++ put_io_context(bfqd->active_bic->icq.ioc); ++ bfqd->active_bic = NULL; ++ } ++ ++ bfqd->active_queue = NULL; ++ del_timer(&bfqd->idle_slice_timer); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq == bfqd->active_queue) ++ __bfq_bfqd_reset_active(bfqd); ++ ++ bfq_deactivate_entity(entity, requeue); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_entity(entity); ++} ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++} +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 0000000..ce3c474 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,617 @@ ++/* ++ * BFQ-v5r1 for 3.7.0: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++#include ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5 ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++ ++#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * @active: tree for active entities (i.e., those backlogged). ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). ++ * @first_idle: idle entity with minimum F_i. ++ * @last_idle: idle entity with maximum F_i. ++ * @vtime: scheduler virtual time. ++ * @wsum: scheduler weight sum; active and idle entities contribute to it. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. 
++ */ ++struct bfq_service_tree { ++ struct rb_root active; ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; ++ struct bfq_entity *last_idle; ++ ++ u64 vtime; ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * @active_entity: entity under service. ++ * @next_active: head-of-the-line entity in the scheduler. ++ * @service_tree: array of service trees, one per ioprio_class. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as ++ * an intermediate queue on a hierarchical setup. ++ * @next_active points to the active entity of the sched_data service ++ * trees that will be scheduled next. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *active_entity; ++ struct bfq_entity *next_active; ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * @rb_node: service_tree member. ++ * @on_st: flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). ++ * @finish: B-WF2Q+ finish timestamp (aka F_i). ++ * @start: B-WF2Q+ start timestamp (aka S_i). ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. ++ * @min_start: minimum start time of the (active) subtree rooted at ++ * this entity; used for O(log N) lookups into active trees. ++ * @service: service received during the last round of service. ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. ++ * @weight: weight of the queue ++ * @parent: parent entity, for hierarchical scheduling. ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the ++ * associated scheduler queue, %NULL on leaf nodes. ++ * @sched_data: the scheduler queue this entity belongs to. ++ * @ioprio: the ioprio in use. ++ * @new_weight: when a weight change is requested, the new weight value. ++ * @orig_weight: original weight, used to implement weight boosting ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. ++ * @ioprio_class: the ioprio_class in use. ++ * @new_ioprio_class: when an ioprio_class change is requested, the new ++ * ioprio_class value. ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or ++ * ioprio_class change. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @ioprio_changed flag. 
As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; ++ ++ int on_st; ++ ++ u64 finish; ++ u64 start; ++ ++ struct rb_root *tree; ++ ++ u64 min_start; ++ ++ unsigned long service, budget; ++ unsigned short weight, new_weight; ++ unsigned short orig_weight; ++ ++ struct bfq_entity *parent; ++ ++ struct bfq_sched_data *my_sched_data; ++ struct bfq_sched_data *sched_data; ++ ++ unsigned short ioprio, new_ioprio; ++ unsigned short ioprio_class, new_ioprio_class; ++ ++ int ioprio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * @ref: reference counter. ++ * @bfqd: parent bfq_data. ++ * @new_bfqq: shared bfq_queue if queue is cooperating with ++ * one or more other queues. ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). ++ * @sort_list: sorted list of pending requests. ++ * @next_rq: if fifo isn't expired, next request to serve. ++ * @queued: nr of requests queued in @sort_list. ++ * @allocated: currently allocated requests. ++ * @meta_pending: pending metadata requests. ++ * @fifo: fifo list of requests in sort_list. ++ * @entity: entity representing this queue in the scheduler. ++ * @max_budget: maximum budget allowed from the feedback mechanism. ++ * @budget_timeout: budget expiration (in jiffies). ++ * @dispatched: number of requests on the dispatch list or inside driver. ++ * @org_ioprio: saved ioprio during boosted periods. ++ * @flags: status flags. ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. ++ * @seek_samples: number of seeks sampled ++ * @seek_total: sum of the distances of the seeks sampled ++ * @seek_mean: mean seek distance ++ * @last_request_pos: position of the last request enqueued ++ * @pid: pid of the process owning the queue, used for logging purposes. ++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt ++ * @raising_cur_max_time: current max raising time for this queue ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the ++ * queue is shared ++ * ++ * A bfq_queue is a leaf request queue; it can be associated to an io_context ++ * or more (if it is an async one). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and task ++ * migration followed by cgroup distruction). ++ * All the fields are protected by the queue lock of the containing bfqd. 
++ */ ++struct bfq_queue { ++ atomic_t ref; ++ struct bfq_data *bfqd; ++ ++ /* fields for cooperating queues handling */ ++ struct bfq_queue *new_bfqq; ++ struct rb_node pos_node; ++ struct rb_root *pos_root; ++ ++ struct rb_root sort_list; ++ struct request *next_rq; ++ int queued[2]; ++ int allocated[2]; ++ int meta_pending; ++ struct list_head fifo; ++ ++ struct bfq_entity entity; ++ ++ unsigned long max_budget; ++ unsigned long budget_timeout; ++ ++ int dispatched; ++ ++ unsigned short org_ioprio; ++ ++ unsigned int flags; ++ ++ struct list_head bfqq_list; ++ ++ unsigned int seek_samples; ++ u64 seek_total; ++ sector_t seek_mean; ++ sector_t last_request_pos; ++ ++ pid_t pid; ++ struct bfq_io_cq *bic; ++ ++ /* weight-raising fields */ ++ unsigned int raising_cur_max_time; ++ u64 last_rais_start_finish, soft_rt_next_start; ++ unsigned int raising_coeff; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ * @ttime_total: total process thinktime ++ * @ttime_samples: number of thinktime samples ++ * @ttime_mean: average process thinktime ++ */ ++struct bfq_ttime { ++ unsigned long last_end_request; ++ ++ unsigned long ttime_total; ++ unsigned long ttime_samples; ++ unsigned long ttime_mean; ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ * @icq: associated io_cq structure ++ * @bfqq: array of two process queues, the sync and the async ++ * @ttime: associated @bfq_ttime struct ++ * @raising_time_left: snapshot of the time left before weight raising ends ++ * for the sync queue associated to this process; this ++ * snapshot is taken to remember this value while the weight ++ * raising is suspended because the queue is merged with a ++ * shared queue, and is used to set @raising_cur_max_time ++ * when the queue is split from the shared queue and its ++ * weight is raised again ++ * @saved_idle_window: same purpose as the previous field for the idle window ++ */ ++struct bfq_io_cq { ++ struct io_cq icq; /* must be the first member */ ++ struct bfq_queue *bfqq[2]; ++ struct bfq_ttime ttime; ++ int ioprio; ++ ++ unsigned int raising_time_left; ++ unsigned int saved_idle_window; ++}; ++ ++/** ++ * struct bfq_data - per device data structure. ++ * @queue: request queue for the managed device. ++ * @root_group: root bfq_group for the device. ++ * @rq_pos_tree: rbtree sorted by next_request position, ++ * used when determining if two or more queues ++ * have interleaving requests (see bfq_close_cooperator). ++ * @busy_queues: number of bfq_queues containing requests (including the ++ * queue under service, even if it is idling). ++ * @queued: number of queued requests. ++ * @rq_in_driver: number of requests dispatched and waiting for completion. ++ * @sync_flight: number of sync requests in the driver. ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples ++ * completed requests . ++ * @hw_tag_samples: nr of samples used to calculate hw_tag. ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. ++ * @budgets_assigned: number of budgets assigned. ++ * @idle_slice_timer: timer set when idling for the next sequential request ++ * from the queue under service. ++ * @unplug_work: delayed work to restart dispatching on the request queue. ++ * @active_queue: bfq_queue under service. ++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue. ++ * @last_position: on-disk position of the last served request. ++ * @last_budget_start: beginning of the last budget. 
++ * @last_idling_start: beginning of the last idle slice. ++ * @peak_rate: peak transfer rate observed for a budget. ++ * @peak_rate_samples: number of samples used to calculate @peak_rate. ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. ++ * @group_list: list of all the bfq_groups active on the device. ++ * @active_list: list of all the bfq_queues active on the device. ++ * @idle_list: list of all the bfq_queues idle on the device. ++ * @bfq_quantum: max number of requests dispatched per dispatch round. ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires ++ * requests are served in fifo order. ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. ++ * @bfq_back_max: maximum allowed backward seek. ++ * @bfq_slice_idle: maximum idling time. ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to ++ * async queues. ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to ++ * to prevent seeky queues to impose long latencies to well ++ * behaved ones (this also implies that seeky queues cannot ++ * receive guarantees in the service domain; after a timeout ++ * they are charged for the whole allocated budget, to try ++ * to preserve a behavior reasonably fair among them, but ++ * without service-domain guarantees). ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted ++ * queue is multiplied ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising ++ * may be reactivated for a queue (in jiffies) ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals ++ * after which weight-raising may be ++ * reactivated for an already busy queue ++ * (in jiffies) ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, ++ * sectors per seconds ++ * @RT_prod: cached value of the product R*T used for computing the maximum ++ * duration of the weight raising automatically ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions ++ * ++ * All the fields are protected by the @queue lock. 
++ */ ++struct bfq_data { ++ struct request_queue *queue; ++ ++ struct bfq_group *root_group; ++ ++ struct rb_root rq_pos_tree; ++ ++ int busy_queues; ++ int queued; ++ int rq_in_driver; ++ int sync_flight; ++ ++ int max_rq_in_driver; ++ int hw_tag_samples; ++ int hw_tag; ++ ++ int budgets_assigned; ++ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct bfq_queue *active_queue; ++ struct bfq_io_cq *active_bic; ++ ++ sector_t last_position; ++ ++ ktime_t last_budget_start; ++ ktime_t last_idling_start; ++ int peak_rate_samples; ++ u64 peak_rate; ++ unsigned long bfq_max_budget; ++ ++ struct hlist_head group_list; ++ struct list_head active_list; ++ struct list_head idle_list; ++ ++ unsigned int bfq_quantum; ++ unsigned int bfq_fifo_expire[2]; ++ unsigned int bfq_back_penalty; ++ unsigned int bfq_back_max; ++ unsigned int bfq_slice_idle; ++ u64 bfq_class_idle_last_service; ++ ++ unsigned int bfq_user_max_budget; ++ unsigned int bfq_max_budget_async_rq; ++ unsigned int bfq_timeout[2]; ++ ++ bool low_latency; ++ ++ /* parameters of the low_latency heuristics */ ++ unsigned int bfq_raising_coeff; ++ unsigned int bfq_raising_max_time; ++ unsigned int bfq_raising_rt_max_time; ++ unsigned int bfq_raising_min_idle_time; ++ unsigned int bfq_raising_min_inter_arr_async; ++ unsigned int bfq_raising_max_softrt_rate; ++ u64 RT_prod; ++ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(prio_changed); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(some_coop_idle); ++BFQ_BFQQ_FNS(just_split); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ ++/* Expiration reasons. 
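Editor's aside (illustrative only, not part of the patch): BFQ_BFQQ_FNS(busy) above expands to the mark/clear/test triple shown below, one triple per flag bit in bfqq->flags. The standalone copy only exists so the expansion can be compiled and exercised outside the kernel.

/* What BFQ_BFQQ_FNS(busy) expands to, as a self-contained userspace program. */
#include <assert.h>

struct bfq_queue { unsigned int flags; };	/* reduced stand-in for the real struct */

enum bfqq_state_flags { BFQ_BFQQ_FLAG_busy = 0 };

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	bfqq->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

int main(void)
{
	struct bfq_queue q = { .flags = 0 };

	bfq_mark_bfqq_busy(&q);
	assert(bfq_bfqq_busy(&q));
	bfq_clear_bfqq_busy(&q);
	assert(!bfq_bfqq_busy(&q));
	return 0;
}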
*/ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++}; ++ ++#ifdef CONFIG_CGROUP_BFQIO ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data ++ * list of the containing cgroup's bfqio_cgroup. ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list ++ * of the groups active on the same device; used for cleanup. ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/migration. ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed ++ * via RCU from its readers. ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ struct hlist_node group_node; ++ struct hlist_node bfqd_node; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++}; ++ ++/** ++ * struct bfqio_cgroup - bfq cgroup data structure. ++ * @css: subsystem state for bfq in the containing cgroup. ++ * @weight: cgroup weight. ++ * @ioprio: cgroup ioprio. ++ * @ioprio_class: cgroup ioprio_class. ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. ++ * @group_data: list containing the bfq_group belonging to this cgroup. ++ * ++ * @group_data is accessed using RCU, with @lock protecting the updates, ++ * @ioprio and @ioprio_class are protected by @lock. 
++ */ ++struct bfqio_cgroup { ++ struct cgroup_subsys_state css; ++ ++ unsigned short weight, ioprio, ioprio_class; ++ ++ spinlock_t lock; ++ struct hlist_head group_data; ++}; ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++}; ++#endif ++ ++static inline struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ unsigned int idx = entity->ioprio_class - 1; ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ return sched_data->service_tree + idx; ++} ++ ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, ++ int is_sync) ++{ ++ return bic->bfqq[!!is_sync]; ++} ++ ++static inline void bic_set_bfqq(struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, int is_sync) ++{ ++ bic->bfqq[!!is_sync] = bfqq; ++} ++ ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++/** ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. ++ * @ptr: a pointer to a bfqd. ++ * @flags: storage for the flags to be saved. ++ * ++ * This function allows bfqg->bfqd to be protected by the ++ * queue lock of the bfqd they reference; the pointer is dereferenced ++ * under RCU, so the storage for bfqd is assured to be safe as long ++ * as the RCU read side critical section does not end. After the ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be ++ * sure that no other writer accessed it. If we raced with a writer, ++ * the function returns NULL, with the queue unlocked, otherwise it ++ * returns the dereferenced pointer, with the queue locked. 
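++ *
++ * A minimal usage sketch (hypothetical caller; bfqg is assumed to be a
++ * struct bfq_group pointer whose ->bfqd field holds the RCU-published
++ * bfq_data pointer described above):
++ *
++ *   unsigned long flags;
++ *   struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
++ *
++ *   if (bfqd != NULL) {
++ *           ... work on bfqd under bfqd->queue->queue_lock ...
++ *           bfq_put_bfqd_unlock(bfqd, &flags);
++ *   }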
++ */ ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, ++ unsigned long *flags) ++{ ++ struct bfq_data *bfqd; ++ ++ rcu_read_lock(); ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); ++ ++ if (bfqd != NULL) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); ++ if (*ptr == bfqd) ++ goto out; ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++ } ++ ++ bfqd = NULL; ++out: ++ rcu_read_unlock(); ++ return bfqd; ++} ++ ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, ++ unsigned long *flags) ++{ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++} ++ ++static void bfq_changed_ioprio(struct bfq_io_cq *bic); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask); ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++#endif +-- +1.8.1.2 + diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch new file mode 100644 index 000000000..27848ccc2 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch @@ -0,0 +1,22319 @@ +# Calculate format=diff os_linux_system==desktop +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 9aa8ff3..2ca1256 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -3073,6 +3073,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + HIGHMEM regardless of setting + of CONFIG_HIGHPTE. + ++ uuid_debug= (Boolean) whether to enable debugging of TuxOnIce's ++ uuid support. ++ + vdso= [X86,SH] + vdso=2: enable compat VDSO (default with COMPAT_VDSO) + vdso=1: enable VDSO (default) +diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt +new file mode 100644 +index 0000000..7a96186 +--- /dev/null ++++ b/Documentation/power/tuxonice-internals.txt +@@ -0,0 +1,477 @@ ++ TuxOnIce 3.0 Internal Documentation. ++ Updated to 26 March 2009 ++ ++1. Introduction. ++ ++ TuxOnIce 3.0 is an addition to the Linux Kernel, designed to ++ allow the user to quickly shutdown and quickly boot a computer, without ++ needing to close documents or programs. It is equivalent to the ++ hibernate facility in some laptops. This implementation, however, ++ requires no special BIOS or hardware support. ++ ++ The code in these files is based upon the original implementation ++ prepared by Gabor Kuti and additional work by Pavel Machek and a ++ host of others. This code has been substantially reworked by Nigel ++ Cunningham, again with the help and testing of many others, not the ++ least of whom is Michael Frank. At its heart, however, the operation is ++ essentially the same as Gabor's version. ++ ++2. Overview of operation. ++ ++ The basic sequence of operations is as follows: ++ ++ a. Quiesce all other activity. ++ b. Ensure enough memory and storage space are available, and attempt ++ to free memory/storage if necessary. ++ c. Allocate the required memory and storage space. ++ d. Write the image. ++ e. Power down. 
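 As a rough sketch of this sequence (plain userspace C with illustrative
 function names, not TuxOnIce's actual code): each step can fail, and any
 failure after freezing has to thaw everything again so the user gets a
 working system back.

    #include <stdio.h>

    static int quiesce_activity(void)           { puts("a. freeze processes"); return 0; }
    static int prepare_memory_and_storage(void) { puts("b. check/free memory and storage"); return 0; }
    static int allocate_image(void)             { puts("c. allocate metadata and storage"); return 0; }
    static int write_image(void)                { puts("d. write pagesets and header"); return 0; }
    static void power_down(void)                { puts("e. power down"); }
    static void thaw_activity(void)             { puts("abort: thaw processes"); }

    int main(void)
    {
        if (quiesce_activity())
            return 1;

        if (prepare_memory_and_storage() || allocate_image() || write_image()) {
            thaw_activity();    /* on any failure, hand back a usable system */
            return 1;
        }

        power_down();
        return 0;
    }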
++ ++ There are a number of complicating factors which mean that things are ++ not as simple as the above would imply, however... ++ ++ o The activity of each process must be stopped at a point where it will ++ not be holding locks necessary for saving the image, or unexpectedly ++ restart operations due to something like a timeout and thereby make ++ our image inconsistent. ++ ++ o It is desirous that we sync outstanding I/O to disk before calculating ++ image statistics. This reduces corruption if one should suspend but ++ then not resume, and also makes later parts of the operation safer (see ++ below). ++ ++ o We need to get as close as we can to an atomic copy of the data. ++ Inconsistencies in the image will result in inconsistent memory contents at ++ resume time, and thus in instability of the system and/or file system ++ corruption. This would appear to imply a maximum image size of one half of ++ the amount of RAM, but we have a solution... (again, below). ++ ++ o In 2.6, we choose to play nicely with the other suspend-to-disk ++ implementations. ++ ++3. Detailed description of internals. ++ ++ a. Quiescing activity. ++ ++ Safely quiescing the system is achieved using three separate but related ++ aspects. ++ ++ First, we note that the vast majority of processes don't need to run during ++ suspend. They can be 'frozen'. We therefore implement a refrigerator ++ routine, which processes enter and in which they remain until the cycle is ++ complete. Processes enter the refrigerator via try_to_freeze() invocations ++ at appropriate places. A process cannot be frozen in any old place. It ++ must not be holding locks that will be needed for writing the image or ++ freezing other processes. For this reason, userspace processes generally ++ enter the refrigerator via the signal handling code, and kernel threads at ++ the place in their event loops where they drop locks and yield to other ++ processes or sleep. ++ ++ The task of freezing processes is complicated by the fact that there can be ++ interdependencies between processes. Freezing process A before process B may ++ mean that process B cannot be frozen, because it stops at waiting for ++ process A rather than in the refrigerator. This issue is seen where ++ userspace waits on freezeable kernel threads or fuse filesystem threads. To ++ address this issue, we implement the following algorithm for quiescing ++ activity: ++ ++ - Freeze filesystems (including fuse - userspace programs starting ++ new requests are immediately frozen; programs already running ++ requests complete their work before being frozen in the next ++ step) ++ - Freeze userspace ++ - Thaw filesystems (this is safe now that userspace is frozen and no ++ fuse requests are outstanding). ++ - Invoke sys_sync (noop on fuse). ++ - Freeze filesystems ++ - Freeze kernel threads ++ ++ If we need to free memory, we thaw kernel threads and filesystems, but not ++ userspace. We can then free caches without worrying about deadlocks due to ++ swap files being on frozen filesystems or such like. ++ ++ b. Ensure enough memory & storage are available. ++ ++ We have a number of constraints to meet in order to be able to successfully ++ suspend and resume. ++ ++ First, the image will be written in two parts, described below. One of these ++ parts needs to have an atomic copy made, which of course implies a maximum ++ size of one half of the amount of system memory. The other part ('pageset') ++ is not atomically copied, and can therefore be as large or small as desired. 
++ ++ Second, we have constraints on the amount of storage available. In these ++ calculations, we may also consider any compression that will be done. The ++ cryptoapi module allows the user to configure an expected compression ratio. ++ ++ Third, the user can specify an arbitrary limit on the image size, in ++ megabytes. This limit is treated as a soft limit, so that we don't fail the ++ attempt to suspend if we cannot meet this constraint. ++ ++ c. Allocate the required memory and storage space. ++ ++ Having done the initial freeze, we determine whether the above constraints ++ are met, and seek to allocate the metadata for the image. If the constraints ++ are not met, or we fail to allocate the required space for the metadata, we ++ seek to free the amount of memory that we calculate is needed and try again. ++ We allow up to four iterations of this loop before aborting the cycle. If we ++ do fail, it should only be because of a bug in TuxOnIce's calculations. ++ ++ These steps are merged together in the prepare_image function, found in ++ prepare_image.c. The functions are merged because of the cyclical nature ++ of the problem of calculating how much memory and storage is needed. Since ++ the data structures containing the information about the image must ++ themselves take memory and use storage, the amount of memory and storage ++ required changes as we prepare the image. Since the changes are not large, ++ only one or two iterations will be required to achieve a solution. ++ ++ The recursive nature of the algorithm is miminised by keeping user space ++ frozen while preparing the image, and by the fact that our records of which ++ pages are to be saved and which pageset they are saved in use bitmaps (so ++ that changes in number or fragmentation of the pages to be saved don't ++ feedback via changes in the amount of memory needed for metadata). The ++ recursiveness is thus limited to any extra slab pages allocated to store the ++ extents that record storage used, and the effects of seeking to free memory. ++ ++ d. Write the image. ++ ++ We previously mentioned the need to create an atomic copy of the data, and ++ the half-of-memory limitation that is implied in this. This limitation is ++ circumvented by dividing the memory to be saved into two parts, called ++ pagesets. ++ ++ Pageset2 contains most of the page cache - the pages on the active and ++ inactive LRU lists that aren't needed or modified while TuxOnIce is ++ running, so they can be safely written without an atomic copy. They are ++ therefore saved first and reloaded last. While saving these pages, ++ TuxOnIce carefully ensures that the work of writing the pages doesn't make ++ the image inconsistent. With the support for Kernel (Video) Mode Setting ++ going into the kernel at the time of writing, we need to check for pages ++ on the LRU that are used by KMS, and exclude them from pageset2. They are ++ atomically copied as part of pageset 1. ++ ++ Once pageset2 has been saved, we prepare to do the atomic copy of remaining ++ memory. As part of the preparation, we power down drivers, thereby providing ++ them with the opportunity to have their state recorded in the image. The ++ amount of memory allocated by drivers for this is usually negligible, but if ++ DRI is in use, video drivers may require significants amounts. Ideally we ++ would be able to query drivers while preparing the image as to the amount of ++ memory they will need. Unfortunately no such mechanism exists at the time of ++ writing. 
For this reason, TuxOnIce allows the user to set an ++ 'extra_pages_allowance', which is used to seek to ensure sufficient memory ++ is available for drivers at this point. TuxOnIce also lets the user set this ++ value to 0. In this case, a test driver suspend is done while preparing the ++ image, and the difference (plus a margin) used instead. TuxOnIce will also ++ automatically restart the hibernation process (twice at most) if it finds ++ that the extra pages allowance is not sufficient. It will then use what was ++ actually needed (plus a margin, again). Failure to hibernate should thus ++ be an extremely rare occurence. ++ ++ Having suspended the drivers, we save the CPU context before making an ++ atomic copy of pageset1, resuming the drivers and saving the atomic copy. ++ After saving the two pagesets, we just need to save our metadata before ++ powering down. ++ ++ As we mentioned earlier, the contents of pageset2 pages aren't needed once ++ they've been saved. We therefore use them as the destination of our atomic ++ copy. In the unlikely event that pageset1 is larger, extra pages are ++ allocated while the image is being prepared. This is normally only a real ++ possibility when the system has just been booted and the page cache is ++ small. ++ ++ This is where we need to be careful about syncing, however. Pageset2 will ++ probably contain filesystem meta data. If this is overwritten with pageset1 ++ and then a sync occurs, the filesystem will be corrupted - at least until ++ resume time and another sync of the restored data. Since there is a ++ possibility that the user might not resume or (may it never be!) that ++ TuxOnIce might oops, we do our utmost to avoid syncing filesystems after ++ copying pageset1. ++ ++ e. Power down. ++ ++ Powering down uses standard kernel routines. TuxOnIce supports powering down ++ using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off. ++ Supporting suspend to ram (S3) as a power off option might sound strange, ++ but it allows the user to quickly get their system up and running again if ++ the battery doesn't run out (we just need to re-read the overwritten pages) ++ and if the battery does run out (or the user removes power), they can still ++ resume. ++ ++4. Data Structures. ++ ++ TuxOnIce uses three main structures to store its metadata and configuration ++ information: ++ ++ a) Pageflags bitmaps. ++ ++ TuxOnIce records which pages will be in pageset1, pageset2, the destination ++ of the atomic copy and the source of the atomically restored image using ++ bitmaps. The code used is that written for swsusp, with small improvements ++ to match TuxOnIce's requirements. ++ ++ The pageset1 bitmap is thus easily stored in the image header for use at ++ resume time. ++ ++ As mentioned above, using bitmaps also means that the amount of memory and ++ storage required for recording the above information is constant. This ++ greatly simplifies the work of preparing the image. In earlier versions of ++ TuxOnIce, extents were used to record which pages would be stored. In that ++ case, however, eating memory could result in greater fragmentation of the ++ lists of pages, which in turn required more memory to store the extents and ++ more storage in the image header. These could in turn require further ++ freeing of memory, and another iteration. All of this complexity is removed ++ by having bitmaps. ++ ++ Bitmaps also make a lot of sense because TuxOnIce only ever iterates ++ through the lists. 
There is therefore no cost to not being able to find the ++ nth page in order 0 time. We only need to worry about the cost of finding ++ the n+1th page, given the location of the nth page. Bitwise optimisations ++ help here. ++ ++ b) Extents for block data. ++ ++ TuxOnIce supports writing the image to multiple block devices. In the case ++ of swap, multiple partitions and/or files may be in use, and we happily use ++ them all (with the exception of compcache pages, which we allocate but do ++ not use). This use of multiple block devices is accomplished as follows: ++ ++ Whatever the actual source of the allocated storage, the destination of the ++ image can be viewed in terms of one or more block devices, and on each ++ device, a list of sectors. To simplify matters, we only use contiguous, ++ PAGE_SIZE aligned sectors, like the swap code does. ++ ++ Since sector numbers on each bdev may well not start at 0, it makes much ++ more sense to use extents here. Contiguous ranges of pages can thus be ++ represented in the extents by contiguous values. ++ ++ Variations in block size are taken account of in transforming this data ++ into the parameters for bio submission. ++ ++ We can thus implement a layer of abstraction wherein the core of TuxOnIce ++ doesn't have to worry about which device we're currently writing to or ++ where in the device we are. It simply requests that the next page in the ++ pageset or header be written, leaving the details to this lower layer. ++ The lower layer remembers where in the sequence of devices and blocks each ++ pageset starts. The header always starts at the beginning of the allocated ++ storage. ++ ++ So extents are: ++ ++ struct extent { ++ unsigned long minimum, maximum; ++ struct extent *next; ++ } ++ ++ These are combined into chains of extents for a device: ++ ++ struct extent_chain { ++ int size; /* size of the extent ie sum (max-min+1) */ ++ int allocs, frees; ++ char *name; ++ struct extent *first, *last_touched; ++ }; ++ ++ For each bdev, we need to store a little more info: ++ ++ struct suspend_bdev_info { ++ struct block_device *bdev; ++ dev_t dev_t; ++ int bmap_shift; ++ int blocks_per_page; ++ }; ++ ++ The dev_t is used to identify the device in the stored image. As a result, ++ we expect devices at resume time to have the same major and minor numbers ++ as they had while suspending. This is primarily a concern where the user ++ utilises LVM for storage, as they will need to dmsetup their partitions in ++ such a way as to maintain this consistency at resume time. ++ ++ bmap_shift and blocks_per_page apply the effects of variations in blocks ++ per page settings for the filesystem and underlying bdev. For most ++ filesystems, these are the same, but for xfs, they can have independant ++ values. ++ ++ Combining these two structures together, we have everything we need to ++ record what devices and what blocks on each device are being used to ++ store the image, and to submit i/o using bio_submit. ++ ++ The last elements in the picture are a means of recording how the storage ++ is being used. 
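 Before getting to that tracking, a toy illustration of the extent
 representation may help. This is not TuxOnIce code (the allocs, frees and
 name bookkeeping of struct extent_chain is dropped), but it shows how runs
 of contiguous block numbers collapse into a few extents:

    #include <stdio.h>
    #include <stdlib.h>

    struct extent {
        unsigned long minimum, maximum;
        struct extent *next;
    };

    struct extent_chain {
        int size;                     /* sum over extents of (max - min + 1) */
        struct extent *first, *last_touched;
    };

    /* Toy append: extend the tail extent if the value is contiguous,
     * otherwise start a new extent at the end of the chain. */
    static void chain_add(struct extent_chain *c, unsigned long value)
    {
        struct extent *last = c->last_touched;

        if (last && value == last->maximum + 1) {
            last->maximum = value;
        } else {
            struct extent *e = malloc(sizeof(*e));
            e->minimum = e->maximum = value;
            e->next = NULL;
            if (last)
                last->next = e;
            else
                c->first = e;
            c->last_touched = e;
        }
        c->size++;
    }

    int main(void)
    {
        struct extent_chain chain = { 0 };
        unsigned long blocks[] = { 100, 101, 102, 200, 201, 500 };
        struct extent *e;

        for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
            chain_add(&chain, blocks[i]);

        for (e = chain.first; e; e = e->next)
            printf("extent [%lu-%lu]\n", e->minimum, e->maximum);
        printf("%d blocks in total\n", chain.size);
        return 0;
    }

 Running it prints extent [100-102], extent [200-201], extent [500-500] and
 a total of 6 blocks, which is exactly the compactness the extent chains are
 after.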
++ ++ We do this first and foremost by implementing a layer of abstraction on ++ top of the devices and extent chains which allows us to view however many ++ devices there might be as one long storage tape, with a single 'head' that ++ tracks a 'current position' on the tape: ++ ++ struct extent_iterate_state { ++ struct extent_chain *chains; ++ int num_chains; ++ int current_chain; ++ struct extent *current_extent; ++ unsigned long current_offset; ++ }; ++ ++ That is, *chains points to an array of size num_chains of extent chains. ++ For the filewriter, this is always a single chain. For the swapwriter, the ++ array is of size MAX_SWAPFILES. ++ ++ current_chain, current_extent and current_offset thus point to the current ++ index in the chains array (and into a matching array of struct ++ suspend_bdev_info), the current extent in that chain (to optimise access), ++ and the current value in the offset. ++ ++ The image is divided into three parts: ++ - The header ++ - Pageset 1 ++ - Pageset 2 ++ ++ The header always starts at the first device and first block. We know its ++ size before we begin to save the image because we carefully account for ++ everything that will be stored in it. ++ ++ The second pageset (LRU) is stored first. It begins on the next page after ++ the end of the header. ++ ++ The first pageset is stored second. It's start location is only known once ++ pageset2 has been saved, since pageset2 may be compressed as it is written. ++ This location is thus recorded at the end of saving pageset2. It is page ++ aligned also. ++ ++ Since this information is needed at resume time, and the location of extents ++ in memory will differ at resume time, this needs to be stored in a portable ++ way: ++ ++ struct extent_iterate_saved_state { ++ int chain_num; ++ int extent_num; ++ unsigned long offset; ++ }; ++ ++ We can thus implement a layer of abstraction wherein the core of TuxOnIce ++ doesn't have to worry about which device we're currently writing to or ++ where in the device we are. It simply requests that the next page in the ++ pageset or header be written, leaving the details to this layer, and ++ invokes the routines to remember and restore the position, without having ++ to worry about the details of how the data is arranged on disk or such like. ++ ++ c) Modules ++ ++ One aim in designing TuxOnIce was to make it flexible. We wanted to allow ++ for the implementation of different methods of transforming a page to be ++ written to disk and different methods of getting the pages stored. ++ ++ In early versions (the betas and perhaps Suspend1), compression support was ++ inlined in the image writing code, and the data structures and code for ++ managing swap were intertwined with the rest of the code. A number of people ++ had expressed interest in implementing image encryption, and alternative ++ methods of storing the image. ++ ++ In order to achieve this, TuxOnIce was given a modular design. ++ ++ A module is a single file which encapsulates the functionality needed ++ to transform a pageset of data (encryption or compression, for example), ++ or to write the pageset to a device. The former type of module is called ++ a 'page-transformer', the later a 'writer'. ++ ++ Modules are linked together in pipeline fashion. There may be zero or more ++ page transformers in a pipeline, and there is always exactly one writer. 
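 As a rough sketch of that chaining (plain userspace C with hypothetical
 names; the real module structure, with its rw_init/rw_cleanup/write_chunk/
 read_chunk hooks, is described below), each stage transforms a page and
 hands it to the next one, with the single writer at the end:

    #include <ctype.h>
    #include <stdio.h>

    #define PAGE_SIZE 16              /* toy page size, keeps the demo readable */

    /* Hypothetical module type: one transform stage or the final writer. */
    struct module {
        const char *name;
        void (*write_chunk)(struct module *self, char *page);
        struct module *next;          /* next stage, NULL for the writer */
    };

    /* A page transformer: here it just upper-cases the page, then passes it on. */
    static void transform_write(struct module *self, char *page)
    {
        for (int i = 0; i < PAGE_SIZE && page[i]; i++)
            page[i] = (char)toupper((unsigned char)page[i]);
        self->next->write_chunk(self->next, page);
    }

    /* The single writer at the end of the chain. */
    static void writer_write(struct module *self, char *page)
    {
        printf("[%s] storing: %s\n", self->name, page);
    }

    int main(void)
    {
        struct module writer      = { "writer",      writer_write,    NULL };
        struct module transformer = { "transformer", transform_write, &writer };
        char page[PAGE_SIZE] = "page data";

        /* The core only ever talks to the first module in the pipeline. */
        transformer.write_chunk(&transformer, page);
        return 0;
    }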
++ The pipeline follows this pattern: ++ ++ --------------------------------- ++ | TuxOnIce Core | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Page transformer 1 | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Page transformer 2 | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Writer | ++ --------------------------------- ++ ++ During the writing of an image, the core code feeds pages one at a time ++ to the first module. This module performs whatever transformations it ++ implements on the incoming data, completely consuming the incoming data and ++ feeding output in a similar manner to the next module. ++ ++ All routines are SMP safe, and the final result of the transformations is ++ written with an index (provided by the core) and size of the output by the ++ writer. As a result, we can have multithreaded I/O without needing to ++ worry about the sequence in which pages are written (or read). ++ ++ During reading, the pipeline works in the reverse direction. The core code ++ calls the first module with the address of a buffer which should be filled. ++ (Note that the buffer size is always PAGE_SIZE at this time). This module ++ will in turn request data from the next module and so on down until the ++ writer is made to read from the stored image. ++ ++ Part of definition of the structure of a module thus looks like this: ++ ++ int (*rw_init) (int rw, int stream_number); ++ int (*rw_cleanup) (int rw); ++ int (*write_chunk) (struct page *buffer_page); ++ int (*read_chunk) (struct page *buffer_page, int sync); ++ ++ It should be noted that the _cleanup routine may be called before the ++ full stream of data has been read or written. While writing the image, ++ the user may (depending upon settings) choose to abort suspending, and ++ if we are in the midst of writing the last portion of the image, a portion ++ of the second pageset may be reread. This may also happen if an error ++ occurs and we seek to abort the process of writing the image. ++ ++ The modular design is also useful in a number of other ways. It provides ++ a means where by we can add support for: ++ ++ - providing overall initialisation and cleanup routines; ++ - serialising configuration information in the image header; ++ - providing debugging information to the user; ++ - determining memory and image storage requirements; ++ - dis/enabling components at run-time; ++ - configuring the module (see below); ++ ++ ...and routines for writers specific to their work: ++ - Parsing a resume= location; ++ - Determining whether an image exists; ++ - Marking a resume as having been attempted; ++ - Invalidating an image; ++ ++ Since some parts of the core - the user interface and storage manager ++ support - have use for some of these functions, they are registered as ++ 'miscellaneous' modules as well. ++ ++ d) Sysfs data structures. ++ ++ This brings us naturally to support for configuring TuxOnIce. We desired to ++ provide a way to make TuxOnIce as flexible and configurable as possible. ++ The user shouldn't have to reboot just because they want to now hibernate to ++ a file instead of a partition, for example. ++ ++ To accomplish this, TuxOnIce implements a very generic means whereby the ++ core and modules can register new sysfs entries. All TuxOnIce entries use ++ a single _store and _show routine, both of which are found in ++ tuxonice_sysfs.c in the kernel/power directory. 
These routines handle the ++ most common operations - getting and setting the values of bits, integers, ++ longs, unsigned longs and strings in one place, and allow overrides for ++ customised get and set options as well as side-effect routines for all ++ reads and writes. ++ ++ When combined with some simple macros, a new sysfs entry can then be defined ++ in just a couple of lines: ++ ++ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, ++ 2048, 0, NULL), ++ ++ This defines a sysfs entry named "progress_granularity" which is rw and ++ allows the user to access an integer stored at &progress_granularity, giving ++ it a value between 1 and 2048 inclusive. ++ ++ Sysfs entries are registered under /sys/power/tuxonice, and entries for ++ modules are located in a subdirectory named after the module. ++ +diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt +new file mode 100644 +index 0000000..3bf0575 +--- /dev/null ++++ b/Documentation/power/tuxonice.txt +@@ -0,0 +1,948 @@ ++ --- TuxOnIce, version 3.0 --- ++ ++1. What is it? ++2. Why would you want it? ++3. What do you need to use it? ++4. Why not just use the version already in the kernel? ++5. How do you use it? ++6. What do all those entries in /sys/power/tuxonice do? ++7. How do you get support? ++8. I think I've found a bug. What should I do? ++9. When will XXX be supported? ++10 How does it work? ++11. Who wrote TuxOnIce? ++ ++1. What is it? ++ ++ Imagine you're sitting at your computer, working away. For some reason, you ++ need to turn off your computer for a while - perhaps it's time to go home ++ for the day. When you come back to your computer next, you're going to want ++ to carry on where you left off. Now imagine that you could push a button and ++ have your computer store the contents of its memory to disk and power down. ++ Then, when you next start up your computer, it loads that image back into ++ memory and you can carry on from where you were, just as if you'd never ++ turned the computer off. You have far less time to start up, no reopening of ++ applications or finding what directory you put that file in yesterday. ++ That's what TuxOnIce does. ++ ++ TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who, ++ with some help from Pavel Machek, got an early version going in 1999. The ++ project was then taken over by Florent Chabaud while still in alpha version ++ numbers. Nigel Cunningham came on the scene when Florent was unable to ++ continue, moving the project into betas, then 1.0, 2.0 and so on up to ++ the present series. During the 2.0 series, the name was contracted to ++ Suspend2 and the website suspend2.net created. Beginning around July 2007, ++ a transition to calling the software TuxOnIce was made, to seek to help ++ make it clear that TuxOnIce is more concerned with hibernation than suspend ++ to ram. ++ ++ Pavel Machek's swsusp code, which was merged around 2.5.17 retains the ++ original name, and was essentially a fork of the beta code until Rafael ++ Wysocki came on the scene in 2005 and began to improve it further. ++ ++2. Why would you want it? ++ ++ Why wouldn't you want it? ++ ++ Being able to save the state of your system and quickly restore it improves ++ your productivity - you get a useful system in far less time than through ++ the normal boot process. 
You also get to be completely 'green', using zero ++ power, or as close to that as possible (the computer may still provide ++ minimal power to some devices, so they can initiate a power on, but that ++ will be the same amount of power as would be used if you told the computer ++ to shutdown. ++ ++3. What do you need to use it? ++ ++ a. Kernel Support. ++ ++ i) The TuxOnIce patch. ++ ++ TuxOnIce is part of the Linux Kernel. This version is not part of Linus's ++ 2.6 tree at the moment, so you will need to download the kernel source and ++ apply the latest patch. Having done that, enable the appropriate options in ++ make [menu|x]config (under Power Management Options - look for "Enhanced ++ Hibernation"), compile and install your kernel. TuxOnIce works with SMP, ++ Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64. ++ ++ TuxOnIce patches are available from http://tuxonice.net. ++ ++ ii) Compression support. ++ ++ Compression support is implemented via the cryptoapi. You will therefore want ++ to select any Cryptoapi transforms that you want to use on your image from ++ the Cryptoapi menu while configuring your kernel. We recommend the use of the ++ LZO compression method - it is very fast and still achieves good compression. ++ ++ You can also tell TuxOnIce to write its image to an encrypted and/or ++ compressed filesystem/swap partition. In that case, you don't need to do ++ anything special for TuxOnIce when it comes to kernel configuration. ++ ++ iii) Configuring other options. ++ ++ While you're configuring your kernel, try to configure as much as possible ++ to build as modules. We recommend this because there are a number of drivers ++ that are still in the process of implementing proper power management ++ support. In those cases, the best way to work around their current lack is ++ to build them as modules and remove the modules while hibernating. You might ++ also bug the driver authors to get their support up to speed, or even help! ++ ++ b. Storage. ++ ++ i) Swap. ++ ++ TuxOnIce can store the hibernation image in your swap partition, a swap file or ++ a combination thereof. Whichever combination you choose, you will probably ++ want to create enough swap space to store the largest image you could have, ++ plus the space you'd normally use for swap. A good rule of thumb would be ++ to calculate the amount of swap you'd want without using TuxOnIce, and then ++ add the amount of memory you have. This swapspace can be arranged in any way ++ you'd like. It can be in one partition or file, or spread over a number. The ++ only requirement is that they be active when you start a hibernation cycle. ++ ++ There is one exception to this requirement. TuxOnIce has the ability to turn ++ on one swap file or partition at the start of hibernating and turn it back off ++ at the end. If you want to ensure you have enough memory to store a image ++ when your memory is fully used, you might want to make one swap partition or ++ file for 'normal' use, and another for TuxOnIce to activate & deactivate ++ automatically. (Further details below). ++ ++ ii) Normal files. ++ ++ TuxOnIce includes a 'file allocator'. The file allocator can store your ++ image in a simple file. Since Linux has the concept of everything being a ++ file, this is more powerful than it initially sounds. If, for example, you ++ were to set up a network block device file, you could hibernate to a network ++ server. This has been tested and works to a point, but nbd itself isn't ++ stateless enough for our purposes. 
++ ++ Take extra care when setting up the file allocator. If you just type ++ commands without thinking and then try to hibernate, you could cause ++ irreversible corruption on your filesystems! Make sure you have backups. ++ ++ Most people will only want to hibernate to a local file. To achieve that, do ++ something along the lines of: ++ ++ echo "TuxOnIce" > /hibernation-file ++ dd if=/dev/zero bs=1M count=512 >> /hibernation-file ++ ++ This will create a 512MB file called /hibernation-file. To get TuxOnIce to use ++ it: ++ ++ echo /hibernation-file > /sys/power/tuxonice/file/target ++ ++ Then ++ ++ cat /sys/power/tuxonice/resume ++ ++ Put the results of this into your bootloader's configuration (see also step ++ C, below): ++ ++ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- ++ # cat /sys/power/tuxonice/resume ++ file:/dev/hda2:0x1e001 ++ ++ In this example, we would edit the append= line of our lilo.conf|menu.lst ++ so that it included: ++ ++ resume=file:/dev/hda2:0x1e001 ++ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- ++ ++ For those who are thinking 'Could I make the file sparse?', the answer is ++ 'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in ++ a sparse file while hibernating. In the longer term (post merge!), I'd like ++ to change things so that the file could be dynamically resized and have ++ holes filled as needed. Right now, however, that's not possible and not a ++ priority. ++ ++ c. Bootloader configuration. ++ ++ Using TuxOnIce also requires that you add an extra parameter to ++ your lilo.conf or equivalent. Here's an example for a swap partition: ++ ++ append="resume=swap:/dev/hda1" ++ ++ This would tell TuxOnIce that /dev/hda1 is a swap partition you ++ have. TuxOnIce will use the swap signature of this partition as a ++ pointer to your data when you hibernate. This means that (in this example) ++ /dev/hda1 doesn't need to be _the_ swap partition where all of your data ++ is actually stored. It just needs to be a swap partition that has a ++ valid signature. ++ ++ You don't need to have a swap partition for this purpose. TuxOnIce ++ can also use a swap file, but usage is a little more complex. Having made ++ your swap file, turn it on and do ++ ++ cat /sys/power/tuxonice/swap/headerlocations ++ ++ (this assumes you've already compiled your kernel with TuxOnIce ++ support and booted it). The results of the cat command will tell you ++ what you need to put in lilo.conf: ++ ++ For swap partitions like /dev/hda1, simply use resume=/dev/hda1. ++ For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d. ++ ++ If the swapfile changes for any reason (it is moved to a different ++ location, it is deleted and recreated, or the filesystem is ++ defragmented) then you will have to check ++ /sys/power/tuxonice/swap/headerlocations for a new resume_block value. ++ ++ Once you've compiled and installed the kernel and adjusted your bootloader ++ configuration, you should only need to reboot for the most basic part ++ of TuxOnIce to be ready. ++ ++ If you only compile in the swap allocator, or only compile in the file ++ allocator, you don't need to add the "swap:" part of the resume= ++ parameters above. resume=/dev/hda2:0x242d will work just as well. If you ++ have compiled both and your storage is on swap, you can also use this ++ format (the swap allocator is the default allocator). 
++ ++ When compiling your kernel, one of the options in the 'Power Management ++ Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is ++ called 'Default resume partition'. This can be used to set a default value ++ for the resume= parameter. ++ ++ d. The hibernate script. ++ ++ Since the driver model in 2.6 kernels is still being developed, you may need ++ to do more than just configure TuxOnIce. Users of TuxOnIce usually start the ++ process via a script which prepares for the hibernation cycle, tells the ++ kernel to do its stuff and then restore things afterwards. This script might ++ involve: ++ ++ - Switching to a text console and back if X doesn't like the video card ++ status on resume. ++ - Un/reloading drivers that don't play well with hibernation. ++ ++ Note that you might not be able to unload some drivers if there are ++ processes using them. You might have to kill off processes that hold ++ devices open. Hint: if your X server accesses an USB mouse, doing a ++ 'chvt' to a text console releases the device and you can unload the ++ module. ++ ++ Check out the latest script (available on tuxonice.net). ++ ++ e. The userspace user interface. ++ ++ TuxOnIce has very limited support for displaying status if you only apply ++ the kernel patch - it can printk messages, but that is all. In addition, ++ some of the functions mentioned in this document (such as cancelling a cycle ++ or performing interactive debugging) are unavailable. To utilise these ++ functions, or simply get a nice display, you need the 'userui' component. ++ Userui comes in three flavours, usplash, fbsplash and text. Text should ++ work on any console. Usplash and fbsplash require the appropriate ++ (distro specific?) support. ++ ++ To utilise a userui, TuxOnIce just needs to be told where to find the ++ userspace binary: ++ ++ echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program ++ ++ The hibernate script can do this for you, and a default value for this ++ setting can be configured when compiling the kernel. This path is also ++ stored in the image header, so if you have an initrd or initramfs, you can ++ use the userui during the first part of resuming (prior to the atomic ++ restore) by putting the binary in the same path in your initrd/ramfs. ++ Alternatively, you can put it in a different location and do an echo ++ similar to the above prior to the echo > do_resume. The value saved in the ++ image header will then be ignored. ++ ++4. Why not just use the version already in the kernel? ++ ++ The version in the vanilla kernel has a number of drawbacks. The most ++ serious of these are: ++ - it has a maximum image size of 1/2 total memory; ++ - it doesn't allocate storage until after it has snapshotted memory. ++ This means that you can't be sure hibernating will work until you ++ see it start to write the image; ++ - it does not allow you to press escape to cancel a cycle; ++ - it does not allow you to press escape to cancel resuming; ++ - it does not allow you to automatically swapon a file when ++ starting a cycle; ++ - it does not allow you to use multiple swap partitions or files; ++ - it does not allow you to use ordinary files; ++ - it just invalidates an image and continues to boot if you ++ accidentally boot the wrong kernel after hibernating; ++ - it doesn't support any sort of nice display while hibernating; ++ - it is moving toward requiring that you have an initrd/initramfs ++ to ever have a hope of resuming (uswsusp). 
While uswsusp will ++ address some of the concerns above, it won't address all of them, ++ and will be more complicated to get set up; ++ - it doesn't have support for suspend-to-both (write a hibernation ++ image, then suspend to ram; I think this is known as ReadySafe ++ under M$). ++ ++5. How do you use it? ++ ++ A hibernation cycle can be started directly by doing: ++ ++ echo > /sys/power/tuxonice/do_hibernate ++ ++ In practice, though, you'll probably want to use the hibernate script ++ to unload modules, configure the kernel the way you like it and so on. ++ In that case, you'd do (as root): ++ ++ hibernate ++ ++ See the hibernate script's man page for more details on the options it ++ takes. ++ ++ If you're using the text or splash user interface modules, one feature of ++ TuxOnIce that you might find useful is that you can press Escape at any time ++ during hibernating, and the process will be aborted. ++ ++ Due to the way hibernation works, this means you'll have your system back and ++ perfectly usable almost instantly. The only exception is when it's at the ++ very end of writing the image. Then it will need to reload a small (usually ++ 4-50MBs, depending upon the image characteristics) portion first. ++ ++ Likewise, when resuming, you can press escape and resuming will be aborted. ++ The computer will then powerdown again according to settings at that time for ++ the powerdown method or rebooting. ++ ++ You can change the settings for powering down while the image is being ++ written by pressing 'R' to toggle rebooting and 'O' to toggle between ++ suspending to ram and powering down completely). ++ ++ If you run into problems with resuming, adding the "noresume" option to ++ the kernel command line will let you skip the resume step and recover your ++ system. This option shouldn't normally be needed, because TuxOnIce modifies ++ the image header prior to the atomic restore, and will thus prompt you ++ if it detects that you've tried to resume an image before (this flag is ++ removed if you press Escape to cancel a resume, so you won't be prompted ++ then). ++ ++ Recent kernels (2.6.24 onwards) add support for resuming from a different ++ kernel to the one that was hibernated (thanks to Rafael for his work on ++ this - I've just embraced and enhanced the support for TuxOnIce). This ++ should further reduce the need for you to use the noresume option. ++ ++6. What do all those entries in /sys/power/tuxonice do? ++ ++ /sys/power/tuxonice is the directory which contains files you can use to ++ tune and configure TuxOnIce to your liking. The exact contents of ++ the directory will depend upon the version of TuxOnIce you're ++ running and the options you selected at compile time. In the following ++ descriptions, names in brackets refer to compile time options. ++ (Note that they're all dependant upon you having selected CONFIG_TUXONICE ++ in the first place!). ++ ++ Since the values of these settings can open potential security risks, the ++ writeable ones are accessible only to the root user. You may want to ++ configure sudo to allow you to invoke your hibernate script as an ordinary ++ user. ++ ++ - alloc/failure_test ++ ++ This debugging option provides a way of testing TuxOnIce's handling of ++ memory allocation failures. Each allocation type that TuxOnIce makes has ++ been given a unique number (see the source code). Echo the appropriate ++ number into this entry, and when TuxOnIce attempts to do that allocation, ++ it will pretend there was a failure and act accordingly. 
++ ++ - alloc/find_max_mem_allocated ++ ++ This debugging option will cause TuxOnIce to find the maximum amount of ++ memory it used during a cycle, and report that information in debugging ++ information at the end of the cycle. ++ ++ - alt_resume_param ++ ++ Instead of powering down after writing a hibernation image, TuxOnIce ++ supports resuming from a different image. This entry lets you set the ++ location of the signature for that image (the resume= value you'd use ++ for it). Using an alternate image and keep_image mode, you can do things ++ like using an alternate image to power down an uninterruptible power ++ supply. ++ ++ - block_io/target_outstanding_io ++ ++ This value controls the amount of memory that the block I/O code says it ++ needs when the core code is calculating how much memory is needed for ++ hibernating and for resuming. It doesn't directly control the amount of ++ I/O that is submitted at any one time - that depends on the amount of ++ available memory (we may have more available than we asked for), the ++ throughput that is being achieved and the ability of the CPU to keep up ++ with disk throughput (particularly where we're compressing pages). ++ ++ - checksum/enabled ++ ++ Use cryptoapi hashing routines to verify that Pageset2 pages don't change ++ while we're saving the first part of the image, and to get any pages that ++ do change resaved in the atomic copy. This should normally not be needed, ++ but if you're seeing issues, please enable this. If your issues stop you ++ being able to resume, enable this option, hibernate and cancel the cycle ++ after the atomic copy is done. If the debugging info shows a non-zero ++ number of pages resaved, please report this to Nigel. ++ ++ - compression/algorithm ++ ++ Set the cryptoapi algorithm used for compressing the image. ++ ++ - compression/expected_compression ++ ++ These values allow you to set an expected compression ratio, which TuxOnice ++ will use in calculating whether it meets constraints on the image size. If ++ this expected compression ratio is not attained, the hibernation cycle will ++ abort, so it is wise to allow some spare. You can see what compression ++ ratio is achieved in the logs after hibernating. ++ ++ - debug_info: ++ ++ This file returns information about your configuration that may be helpful ++ in diagnosing problems with hibernating. ++ ++ - did_suspend_to_both: ++ ++ This file can be used when you hibernate with powerdown method 3 (ie suspend ++ to ram after writing the image). There can be two outcomes in this case. We ++ can resume from the suspend-to-ram before the battery runs out, or we can run ++ out of juice and and up resuming like normal. This entry lets you find out, ++ post resume, which way we went. If the value is 1, we resumed from suspend ++ to ram. This can be useful when actions need to be run post suspend-to-ram ++ that don't need to be run if we did the normal resume from power off. ++ ++ - do_hibernate: ++ ++ When anything is written to this file, the kernel side of TuxOnIce will ++ begin to attempt to write an image to disk and power down. You'll normally ++ want to run the hibernate script instead, to get modules unloaded first. ++ ++ - do_resume: ++ ++ When anything is written to this file TuxOnIce will attempt to read and ++ restore an image. If there is no image, it will return almost immediately. ++ If an image exists, the echo > will never return. Instead, the original ++ kernel context will be restored and the original echo > do_hibernate will ++ return. 
++ ++ - */enabled ++ ++ These option can be used to temporarily disable various parts of TuxOnIce. ++ ++ - extra_pages_allowance ++ ++ When TuxOnIce does its atomic copy, it calls the driver model suspend ++ and resume methods. If you have DRI enabled with a driver such as fglrx, ++ this can result in the driver allocating a substantial amount of memory ++ for storing its state. Extra_pages_allowance tells TuxOnIce how much ++ extra memory it should ensure is available for those allocations. If ++ your attempts at hibernating end with a message in dmesg indicating that ++ insufficient extra pages were allowed, you need to increase this value. ++ ++ - file/target: ++ ++ Read this value to get the current setting. Write to it to point TuxOnice ++ at a new storage location for the file allocator. See section 3.b.ii above ++ for details of how to set up the file allocator. ++ ++ - freezer_test ++ ++ This entry can be used to get TuxOnIce to just test the freezer and prepare ++ an image without actually doing a hibernation cycle. It is useful for ++ diagnosing freezing and image preparation issues. ++ ++ - full_pageset2 ++ ++ TuxOnIce divides the pages that are stored in an image into two sets. The ++ difference between the two sets is that pages in pageset 1 are atomically ++ copied, and pages in pageset 2 are written to disk without being copied ++ first. A page CAN be written to disk without being copied first if and only ++ if its contents will not be modified or used at any time after userspace ++ processes are frozen. A page MUST be in pageset 1 if its contents are ++ modified or used at any time after userspace processes have been frozen. ++ ++ Normally (ie if this option is enabled), TuxOnIce will put all pages on the ++ per-zone LRUs in pageset2, then remove those pages used by any userspace ++ user interface helper and TuxOnIce storage manager that are running, ++ together with pages used by the GEM memory manager introduced around 2.6.28 ++ kernels. ++ ++ If this option is disabled, a much more conservative approach will be taken. ++ The only pages in pageset2 will be those belonging to userspace processes, ++ with the exclusion of those belonging to the TuxOnIce userspace helpers ++ mentioned above. This will result in a much smaller pageset2, and will ++ therefore result in smaller images than are possible with this option ++ enabled. ++ ++ - ignore_rootfs ++ ++ TuxOnIce records which device is mounted as the root filesystem when ++ writing the hibernation image. It will normally check at resume time that ++ this device isn't already mounted - that would be a cause of filesystem ++ corruption. In some particular cases (RAM based root filesystems), you ++ might want to disable this check. This option allows you to do that. ++ ++ - image_exists: ++ ++ Can be used in a script to determine whether a valid image exists at the ++ location currently pointed to by resume=. Returns up to three lines. ++ The first is whether an image exists (-1 for unsure, otherwise 0 or 1). ++ If an image eixsts, additional lines will return the machine and version. ++ Echoing anything to this entry removes any current image. ++ ++ - image_size_limit: ++ ++ The maximum size of hibernation image written to disk, measured in megabytes ++ (1024*1024). ++ ++ - last_result: ++ ++ The result of the last hibernation cycle, as defined in ++ include/linux/suspend-debug.h with the values SUSPEND_ABORTED to ++ SUSPEND_KEPT_IMAGE. This is a bitmask. 
++ ++ - late_cpu_hotplug: ++ ++ This sysfs entry controls whether cpu hotplugging is done - as normal - just ++ before (unplug) and after (replug) the atomic copy/restore (so that all ++ CPUs/cores are available for multithreaded I/O). The alternative is to ++ unplug all secondary CPUs/cores at the start of hibernating/resuming, and ++ replug them at the end of resuming. No multithreaded I/O will be possible in ++ this configuration, but the odd machine has been reported to require it. ++ ++ - lid_file: ++ ++ This determines which ACPI button file we look in to determine whether the ++ lid is open or closed after resuming from suspend to disk or power off. ++ If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state ++ and check its contents at the appropriate moment. See post_wake_state below ++ for more details on how this entry is used. ++ ++ - log_everything (CONFIG_PM_DEBUG): ++ ++ Setting this option results in all messages printed being logged. Normally, ++ only a subset are logged, so as to not slow the process and not clutter the ++ logs. Useful for debugging. It can be toggled during a cycle by pressing ++ 'L'. ++ ++ - no_load_direct: ++ ++ This is a debugging option. If, when loading the atomically copied pages of ++ an image, TuxOnIce finds that the destination address for a page is free, ++ it will normally allocate the image, load the data directly into that ++ address and skip it in the atomic restore. If this option is disabled, the ++ page will be loaded somewhere else and atomically restored like other pages. ++ ++ - no_flusher_thread: ++ ++ When doing multithreaded I/O (see below), the first online CPU can be used ++ to _just_ submit compressed pages when writing the image, rather than ++ compressing and submitting data. This option is normally disabled, but has ++ been included because Nigel would like to see whether it will be more useful ++ as the number of cores/cpus in computers increases. ++ ++ - no_multithreaded_io: ++ ++ TuxOnIce will normally create one thread per cpu/core on your computer, ++ each of which will then perform I/O. This will generally result in ++ throughput that's the maximum the storage medium can handle. There ++ shouldn't be any reason to disable multithreaded I/O now, but this option ++ has been retained for debugging purposes. ++ ++ - no_pageset2 ++ ++ See the entry for full_pageset2 above for an explanation of pagesets. ++ Enabling this option causes TuxOnIce to do an atomic copy of all pages, ++ thereby limiting the maximum image size to 1/2 of memory, as swsusp does. ++ ++ - no_pageset2_if_unneeded ++ ++ See the entry for full_pageset2 above for an explanation of pagesets. ++ Enabling this option causes TuxOnIce to act like no_pageset2 was enabled ++ if and only it isn't needed anyway. This option may still make TuxOnIce ++ less reliable because pageset2 pages are normally used to store the ++ atomic copy - drivers that want to do allocations of larger amounts of ++ memory in one shot will be more likely to find that those amounts aren't ++ available if this option is enabled. ++ ++ - pause_between_steps (CONFIG_PM_DEBUG): ++ ++ This option is used during debugging, to make TuxOnIce pause between ++ each step of the process. It is ignored when the nice display is on. ++ ++ - post_wake_state: ++ ++ TuxOnIce provides support for automatically waking after a user-selected ++ delay, and using a different powerdown method if the lid is still closed. ++ (Yes, we're assuming a laptop). 
This entry lets you choose what state ++ should be entered next. The values are those described under ++ powerdown_method, below. It can be used to suspend to RAM after hibernating, ++ then powerdown properly (say) 20 minutes. It can also be used to power down ++ properly, then wake at (say) 6.30am and suspend to RAM until you're ready ++ to use the machine. ++ ++ - powerdown_method: ++ ++ Used to select a method by which TuxOnIce should powerdown after writing the ++ image. Currently: ++ ++ 0: Don't use ACPI to power off. ++ 3: Attempt to enter Suspend-to-ram. ++ 4: Attempt to enter ACPI S4 mode. ++ 5: Attempt to power down via ACPI S5 mode. ++ ++ Note that these options are highly dependant upon your hardware & software: ++ ++ 3: When succesful, your machine suspends to ram instead of powering off. ++ The advantage of using this mode is that it doesn't matter whether your ++ battery has enough charge to make it through to your next resume. If it ++ lasts, you will simply resume from suspend to ram (and the image on disk ++ will be discarded). If the battery runs out, you will resume from disk ++ instead. The disadvantage is that it takes longer than a normal ++ suspend-to-ram to enter the state, since the suspend-to-disk image needs ++ to be written first. ++ 4/5: When successful, your machine will be off and comsume (almost) no power. ++ But it might still react to some external events like opening the lid or ++ trafic on a network or usb device. For the bios, resume is then the same ++ as warm boot, similar to a situation where you used the command `reboot' ++ to reboot your machine. If your machine has problems on warm boot or if ++ you want to protect your machine with the bios password, this is probably ++ not the right choice. Mode 4 may be necessary on some machines where ACPI ++ wake up methods need to be run to properly reinitialise hardware after a ++ hibernation cycle. ++ 0: Switch the machine completely off. The only possible wakeup is the power ++ button. For the bios, resume is then the same as a cold boot, in ++ particular you would have to provide your bios boot password if your ++ machine uses that feature for booting. ++ ++ - progressbar_granularity_limit: ++ ++ This option can be used to limit the granularity of the progress bar ++ displayed with a bootsplash screen. The value is the maximum number of ++ steps. That is, 10 will make the progress bar jump in 10% increments. ++ ++ - reboot: ++ ++ This option causes TuxOnIce to reboot rather than powering down ++ at the end of saving an image. It can be toggled during a cycle by pressing ++ 'R'. ++ ++ - resume: ++ ++ This sysfs entry can be used to read and set the location in which TuxOnIce ++ will look for the signature of an image - the value set using resume= at ++ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By ++ writing to this file as well as modifying your bootloader's configuration ++ file (eg menu.lst), you can set or reset the location of your image or the ++ method of storing the image without rebooting. ++ ++ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP): ++ ++ This option makes ++ ++ echo disk > /sys/power/state ++ ++ activate TuxOnIce instead of swsusp. Regardless of whether this option is ++ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce ++ to check for an image too. This is due to the fact that at resume time, we ++ can't know whether this option was enabled until we see if an image is there ++ for us to resume from. 
++
++ - progressbar_granularity_limit:
++
++ This option can be used to limit the granularity of the progress bar
++ displayed with a bootsplash screen. The value is the maximum number of
++ steps. That is, 10 will make the progress bar jump in 10% increments.
++
++ - reboot:
++
++ This option causes TuxOnIce to reboot rather than powering down
++ at the end of saving an image. It can be toggled during a cycle by pressing
++ 'R'.
++
++ - resume:
++
++ This sysfs entry can be used to read and set the location in which TuxOnIce
++ will look for the signature of an image - the value set using resume= at
++ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By
++ writing to this file as well as modifying your bootloader's configuration
++ file (eg menu.lst), you can set or reset the location of your image or the
++ method of storing the image without rebooting.
++
++ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP):
++
++ This option makes
++
++ echo disk > /sys/power/state
++
++ activate TuxOnIce instead of swsusp. Regardless of whether this option is
++ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce
++ to check for an image too. This is due to the fact that at resume time, we
++ can't know whether this option was enabled until we see if an image is there
++ for us to resume from. (And when an image exists, we don't care whether we
++ did replace swsusp anyway - we just want to resume).
++
++ - resume_commandline:
++
++ This entry can be read after resuming to see the commandline that was used
++ when resuming began. You might use this to set up two bootloader entries
++ that are the same apart from the fact that one includes an extra append=
++ argument "at_work=1". You could then grep resume_commandline in your
++ post-resume scripts and configure networking (for example) differently
++ depending upon whether you're at home or work. resume_commandline can be
++ set to arbitrary text if you wish to remove sensitive contents.
++
++ - swap/swapfilename:
++
++ This entry is used to specify the swapfile or partition that
++ TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
++ I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
++ for my hibernation image, I would
++
++ echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile
++
++ /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
++ swapon and swapoff occur while other processes are frozen (including kswapd)
++ so this swap file will not be used up when attempting to free memory. The
++ partition/file is also given the highest priority, so other swapfiles/partitions
++ will only be used to save the image when this one is filled.
++
++ The value of this file is used by headerlocations along with any currently
++ activated swapfiles/partitions.
++
++ - swap/headerlocations:
++
++ This option tells you the resume= options to use for swap devices you
++ currently have activated. It is particularly useful when you only want to
++ use a swap file to store your image. See above for further details.
++
++ - test_bio
++
++ This is a debugging option. When enabled, TuxOnIce will not hibernate.
++ Instead, when asked to write an image, it will skip the atomic copy,
++ just doing the writing of the image and then returning control to the
++ user at the point where it would have powered off. This is useful for
++ testing throughput in different configurations.
++
++ - test_filter_speed
++
++ This is a debugging option. When enabled, TuxOnIce will not hibernate.
++ Instead, when asked to write an image, it will not write anything or do
++ an atomic copy, but will only run any enabled compression algorithm on the
++ data that would have been written (the source pages of the atomic copy in
++ the case of pageset 1). This is useful for comparing the performance of
++ compression algorithms and for determining the extent to which an upgrade
++ to your storage method would improve hibernation speed.
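++
++ As a rough sketch of how these two options can be combined (assuming they
++ are 0/1 flags like the other debugging options, the same /sys/power/tuxonice
++ paths as above, and that replace_swsusp is enabled so that
++ "echo disk > /sys/power/state" starts a cycle - otherwise start the cycle
++ however you normally do), you could time a compression-only run and then a
++ write-only run to get a feel for whether compression or storage speed is
++ your bottleneck:
++
++ echo 1 > /sys/power/tuxonice/test_filter_speed
++ echo disk > /sys/power/state        # compression only, nothing written
++ echo 0 > /sys/power/tuxonice/test_filter_speed
++
++ echo 1 > /sys/power/tuxonice/test_bio
++ echo disk > /sys/power/state        # writing, but no atomic copy
++ echo 0 > /sys/power/tuxonice/test_bio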
++
++ - user_interface/debug_sections (CONFIG_PM_DEBUG):
++
++ This value, together with the console log level, controls what debugging
++ information is displayed. The console log level determines the level of
++ detail, and this value determines what detail is displayed. This value is
++ a bit vector, and the meaning of the bits can be found in the kernel tree
++ in include/linux/tuxonice.h. It can be overridden using the kernel's
++ command line option suspend_dbg.
++
++ - user_interface/default_console_level (CONFIG_PM_DEBUG):
++
++ This determines the value of the console log level at the start of a
++ hibernation cycle. If debugging is compiled in, the console log level can be
++ changed during a cycle by pressing the digit keys. Meanings are:
++
++ 0: Nice display.
++ 1: Nice display plus numerical progress.
++ 2: Errors only.
++ 3: Low level debugging info.
++ 4: Medium level debugging info.
++ 5: High level debugging info.
++ 6: Verbose debugging info.
++
++ - user_interface/enable_escape:
++
++ Setting this to "1" will enable you to abort a hibernation cycle or resume
++ by pressing escape; "0" (default) disables this feature. Note that enabling
++ this option means that you cannot initiate a hibernation cycle and then walk
++ away from your computer, expecting it to be secure. With the feature
++ disabled, you can validly have this expectation once TuxOnIce begins to
++ write the image to disk. (Prior to this point, it is possible that TuxOnIce
++ might abort because of a failure to freeze all processes or because
++ constraints on its ability to save the image are not met).
++
++ - user_interface/program
++
++ This entry is used to tell TuxOnIce what userspace program to use for
++ providing a user interface while hibernating. The program uses a netlink
++ socket to pass messages back and forth to the kernel, allowing it to provide
++ all of the functions formerly implemented in the kernel user interface
++ components.
++
++ - version:
++
++ The version of TuxOnIce you have compiled into the currently running kernel.
++
++ - wake_alarm_dir:
++
++ As mentioned above (post_wake_state), TuxOnIce supports automatically waking
++ after some delay. This entry allows you to select which wake alarm to use.
++ It should contain the value "rtc0" if you want to use
++ /sys/class/rtc/rtc0.
++
++ - wake_delay:
++
++ This value determines the delay from the end of writing the image until the
++ wake alarm is triggered. You can set an absolute time by writing the desired
++ time into /sys/class/rtc/<wake_alarm_dir>/wakealarm and leaving these values
++ empty.
++
++ Note that for the wakeup to actually occur, you may need to modify entries
++ in /proc/acpi/wakeup. This is done by echoing the name of the button in the
++ first column (eg PBTN) into the file.
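++
++ Many of these entries can also simply be read back to check the current
++ configuration; for example (same /sys/power/tuxonice paths as above):
++
++ cat /sys/power/tuxonice/version
++ cat /sys/power/tuxonice/resume
++ cat /sys/power/tuxonice/swap/headerlocations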
++
++7. How do you get support?
++
++ Glad you asked. TuxOnIce is being actively maintained and supported
++ by Nigel (the guy doing most of the kernel coding at the moment), Bernard
++ (who maintains the hibernate script and userspace user interface components)
++ and its users.
++
++ Resources available include HowTos, FAQs and a Wiki, all available via
++ tuxonice.net. You can find the mailing lists there.
++
++8. I think I've found a bug. What should I do?
++
++ By far and away, the most common problems people have with TuxOnIce
++ relate to drivers not having adequate power management support. In this
++ case, it is not a bug with TuxOnIce, but we can still help you. As we
++ mentioned above, such issues can usually be worked around by building the
++ functionality as modules and unloading them while hibernating. Please visit
++ the Wiki for up-to-date lists of known issues and workarounds.
++
++ If this information doesn't help, try running:
++
++ hibernate --bug-report
++
++ ...and sending the output to the users mailing list.
++
++ Good information on how to provide us with useful information from an
++ oops is found in the file REPORTING-BUGS, in the top level directory
++ of the kernel tree. If you get an oops, please especially note the
++ information about running what is printed on the screen through ksymoops.
++ The raw information is useless.
++
++9. When will XXX be supported?
++
++ If there's a feature missing from TuxOnIce that you'd like, feel free to
++ ask. We try to be obliging, within reason.
++
++ Patches are welcome. Please send to the list.
++
++10. How does it work?
++
++ TuxOnIce does its work in a number of steps.
++
++ a. Freezing system activity.
++
++ The first main stage in hibernating is to stop all other activity. This is
++ achieved in stages. Processes are considered in four groups, which we will
++ describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
++ flag, kernel threads without this flag, userspace processes with the
++ PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
++ untouched by the refrigerator code. They are allowed to run during hibernating
++ and resuming, and are used to support user interaction, storage access or the
++ like. Other kernel threads (those unneeded while hibernating) are frozen last.
++ This leaves us with userspace processes that need to be frozen. When a
++ process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
++ that process for the duration of that call. Processes that have this flag are
++ frozen after processes without it, so that we can seek to ensure that dirty
++ data is synced to disk as quickly as possible in a situation where other
++ processes may be submitting writes at the same time. Freezing the processes
++ that are submitting data stops new I/O from being submitted. Syncthreads can
++ then cleanly finish their work. So the order is:
++
++ - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
++ - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
++ - Kernel processes without PF_NOFREEZE.
++
++ b. Eating memory.
++
++ For a successful hibernation cycle, you need to have enough disk space to
++ store the image and enough memory for the various limitations of TuxOnIce's
++ algorithm. You can also specify a maximum image size. In order to meet
++ those constraints, TuxOnIce may 'eat' memory. If, after freezing
++ processes, the constraints aren't met, TuxOnIce will thaw all the
++ other processes and begin to eat memory until its calculations indicate
++ the constraints are met. It will then freeze processes again and recheck
++ its calculations.
++
++ c. Allocation of storage.
++
++ Next, TuxOnIce allocates the storage that will be used to save
++ the image.
++
++ The core of TuxOnIce knows nothing about how or where pages are stored. We
++ therefore request the active allocator (remember you might have compiled in
++ more than one!) to allocate enough storage for our expected image size. If
++ this request cannot be fulfilled, we eat more memory and try again. If it
++ is fulfilled, we seek to allocate additional storage, just in case our
++ expected compression ratio (if any) isn't achieved. This time, however, we
++ just continue if we can't allocate enough storage.
++
++ If these calls to our allocator change the characteristics of the image
++ such that we haven't allocated enough memory, we also loop. (The allocator
++ may well need to allocate space for its storage information).
++
++ d. Write the first part of the image.
++
++ TuxOnIce stores the image in two sets of pages called 'pagesets'.
++ Pageset 2 contains pages on the active and inactive lists; essentially
++ the page cache. Pageset 1 contains all other pages, including the kernel.
++ We use two pagesets for one important reason: We need to make an atomic copy
++ of the kernel to ensure consistency of the image. Without a second pageset,
++ that would limit us to an image that was at most half the amount of memory
++ available. Using two pagesets allows us to store a full image.
++ Since pageset 2 pages won't be needed in saving pageset 1, we first save
++ pageset 2 pages.
++ We can then make our atomic copy of the remaining pages using both pageset 2
++ pages and any other pages that are free. While saving both pagesets, we are
++ careful not to corrupt the image. Among other things, we use lowlevel block
++ I/O routines that don't change the pagecache contents.
++
++ The next step, then, is writing pageset 2.
++
++ e. Suspending drivers and storing processor context.
++
++ Having written pageset 2, TuxOnIce calls the power management functions to
++ notify drivers of the hibernation, and saves the processor state in preparation
++ for the atomic copy of memory we are about to make.
++
++ f. Atomic copy.
++
++ At this stage, everything else but the TuxOnIce code is halted. Processes
++ are frozen or idling, drivers are quiesced and have stored (ideally and where
++ necessary) their configuration in memory we are about to atomically copy.
++ In our lowlevel architecture specific code, we have saved the CPU state.
++ We can therefore now do our atomic copy before resuming drivers etc.
++
++ g. Save the atomic copy (pageset 1).
++
++ TuxOnIce can then write the atomic copy of the remaining pages. Since we
++ have copied the pages into other locations, we can continue to use the
++ normal block I/O routines without fear of corrupting our image.
++
++ h. Save the image header.
++
++ Nearly there! We save our settings and other parameters needed for
++ reloading pageset 1 in an 'image header'. We also tell our allocator to
++ serialise its data at this stage, so that it can reread the image at resume
++ time.
++
++ i. Set the image header.
++
++ Finally, we edit the header at our resume= location. The signature is
++ changed by the allocator to reflect the fact that an image exists, and to
++ point to the start of that data if necessary (swap allocator).
++
++ j. Power down.
++
++ Or reboot if we're debugging and the appropriate option is selected.
++
++ Whew!
++
++ Reloading the image.
++ --------------------
++
++ Reloading the image is essentially the reverse of all the above. We load
++ our copy of pageset 1, being careful to choose locations that aren't going
++ to be overwritten as we copy it back (We start very early in the boot
++ process, so there are no other processes to quiesce here). We then copy
++ pageset 1 back to its original location in memory and restore the process
++ context. We are now running with the original kernel. Next, we reload the
++ pageset 2 pages, free the memory and swap used by TuxOnIce, restore
++ the pageset header and restart processes. Sounds easy in comparison to
++ hibernating, doesn't it!
++
++ There is of course more to TuxOnIce than this, but this explanation
++ should be a good start. If there's interest, I'll write further
++ documentation on range pages and the low level I/O.
++
++11. Who wrote TuxOnIce?
++
++ (Answer based on the writings of Florent Chabaud, credits in files and
++ Nigel's limited knowledge; apologies to anyone missed out!)
++
++ The main developers of TuxOnIce have been...
++
++ Gabor Kuti
++ Pavel Machek
++ Florent Chabaud
++ Bernard Blackham
++ Nigel Cunningham
++
++ Significant portions of swsusp, the code in the vanilla kernel which
++ TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
++ also be expressed to him.
++ ++ The above mentioned developers have been aided in their efforts by a host ++ of hundreds, if not thousands of testers and people who have submitted bug ++ fixes & suggestions. Of special note are the efforts of Michael Frank, who ++ had his computers repetitively hibernate and resume for literally tens of ++ thousands of cycles and developed scripts to stress the system and test ++ TuxOnIce far beyond the point most of us (Nigel included!) would consider ++ testing. His efforts have contributed as much to TuxOnIce as any of the ++ names above. +diff --git a/MAINTAINERS b/MAINTAINERS +index a92f485..4b47f3a 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7831,6 +7831,13 @@ S: Maintained + F: drivers/tc/ + F: include/linux/tc.h + ++TUXONICE (ENHANCED HIBERNATION) ++P: Nigel Cunningham ++M: nigel@tuxonice.net ++L: tuxonice-devel@tuxonice.net ++W: http://tuxonice.net ++S: Maintained ++ + U14-34F SCSI DRIVER + M: Dario Ballabio + L: linux-scsi@vger.kernel.org +diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c +index 6c856fb..749ee94 100644 +--- a/arch/powerpc/mm/pgtable_32.c ++++ b/arch/powerpc/mm/pgtable_32.c +@@ -433,6 +433,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) + + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + } ++EXPORT_SYMBOL_GPL(kernel_map_pages); + #endif /* CONFIG_DEBUG_PAGEALLOC */ + + static int fixmaps; +diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c +index 1d769a2..e2877cf 100644 +--- a/arch/powerpc/platforms/83xx/suspend.c ++++ b/arch/powerpc/platforms/83xx/suspend.c +@@ -263,6 +263,8 @@ static int mpc83xx_suspend_begin(suspend_state_t state) + + static int agent_thread_fn(void *data) + { ++ set_freezable(); ++ + while (1) { + wait_event_interruptible(agent_wq, pci_pm_state >= 2); + try_to_freeze(); +diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c +index 3f175e8..b5d59c6 100644 +--- a/arch/powerpc/platforms/ps3/device-init.c ++++ b/arch/powerpc/platforms/ps3/device-init.c +@@ -841,6 +841,8 @@ static int ps3_probe_thread(void *data) + if (res) + goto fail_free_irq; + ++ set_freezable(); ++ + /* Loop here processing the requested notification events. 
*/ + do { + try_to_freeze(); +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c +index a1b1c88..41a5d8b 100644 +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -1368,6 +1368,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) + __flush_tlb_all(); + } + ++EXPORT_SYMBOL_GPL(kernel_map_pages); ++ + #ifdef CONFIG_HIBERNATION + + bool kernel_page_present(struct page *page) +@@ -1381,7 +1383,7 @@ bool kernel_page_present(struct page *page) + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); + } +- ++EXPORT_SYMBOL_GPL(kernel_page_present); + #endif /* CONFIG_HIBERNATION */ + + #endif /* CONFIG_DEBUG_PAGEALLOC */ +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 120cee1..53b9691 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -118,9 +118,7 @@ void save_processor_state(void) + __save_processor_state(&saved_context); + x86_platform.save_sched_clock_state(); + } +-#ifdef CONFIG_X86_32 + EXPORT_SYMBOL(save_processor_state); +-#endif + + static void do_fpu_end(void) + { +diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c +index 7d28c88..4f1dd95 100644 +--- a/arch/x86/power/hibernate_32.c ++++ b/arch/x86/power/hibernate_32.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -161,6 +162,7 @@ int swsusp_arch_resume(void) + restore_image(); + return 0; + } ++EXPORT_SYMBOL_GPL(swsusp_arch_resume); + + /* + * pfn_is_nosave - check if given pfn is in the 'nosave' section +diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c +index a0fde91..9e8ce13 100644 +--- a/arch/x86/power/hibernate_64.c ++++ b/arch/x86/power/hibernate_64.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -97,6 +98,7 @@ int swsusp_arch_resume(void) + restore_image(); + return 0; + } ++EXPORT_SYMBOL_GPL(swsusp_arch_resume); + + /* + * pfn_is_nosave - check if given pfn is in the 'nosave' section +@@ -147,3 +149,4 @@ int arch_hibernation_header_restore(void *addr) + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + } ++EXPORT_SYMBOL_GPL(arch_hibernation_header_restore); +diff --git a/block/Makefile b/block/Makefile +index 39b76ba..0a0125a 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ + blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ +- partition-generic.o partitions/ ++ uuid.o partition-generic.o partitions/ + + obj-$(CONFIG_BLK_DEV_BSG) += bsg.o + obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o +diff --git a/block/blk-core.c b/block/blk-core.c +index 277134c..c420a5c 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -44,6 +44,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); + + DEFINE_IDA(blk_queue_ida); + ++int trap_non_toi_io; ++EXPORT_SYMBOL_GPL(trap_non_toi_io); ++ + /* + * For the allocated request tables + */ +@@ -1854,6 +1857,9 @@ void submit_bio(int rw, struct bio *bio) + { + bio->bi_rw |= rw; + ++ if (unlikely(trap_non_toi_io)) ++ BUG_ON(!(bio->bi_rw & REQ_TOI)); ++ + /* + * If it's a regular read/write or a barrier with data attached, + * go through the normal accounting stuff before submission. 
+diff --git a/block/genhd.c b/block/genhd.c +index 3993ebf..6eba3d2 100644 +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -17,6 +17,8 @@ + #include + #include + #include ++#include ++#include + #include + + #include "blk.h" +@@ -1373,6 +1375,87 @@ int invalidate_partition(struct gendisk *disk, int partno) + + EXPORT_SYMBOL(invalidate_partition); + ++dev_t blk_lookup_fs_info(struct fs_info *seek) ++{ ++ dev_t devt = MKDEV(0, 0); ++ struct class_dev_iter iter; ++ struct device *dev; ++ int best_score = 0; ++ ++ class_dev_iter_init(&iter, &block_class, NULL, &disk_type); ++ while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { ++ struct gendisk *disk = dev_to_disk(dev); ++ struct disk_part_iter piter; ++ struct hd_struct *part; ++ ++ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); ++ ++ while (best_score < 3 && (part = disk_part_iter_next(&piter))) { ++ int score = part_matches_fs_info(part, seek); ++ if (score > best_score) { ++ devt = part_devt(part); ++ best_score = score; ++ } ++ } ++ disk_part_iter_exit(&piter); ++ } ++ class_dev_iter_exit(&iter); ++ return devt; ++} ++EXPORT_SYMBOL_GPL(blk_lookup_fs_info); ++ ++/* Caller uses NULL, key to start. For each match found, we return a bdev on ++ * which we have done blkdev_get, and we do the blkdev_put on block devices ++ * that are passed to us. When no more matches are found, we return NULL. ++ */ ++struct block_device *next_bdev_of_type(struct block_device *last, ++ const char *key) ++{ ++ dev_t devt = MKDEV(0, 0); ++ struct class_dev_iter iter; ++ struct device *dev; ++ struct block_device *next = NULL, *bdev; ++ int got_last = 0; ++ ++ if (!key) ++ goto out; ++ ++ class_dev_iter_init(&iter, &block_class, NULL, &disk_type); ++ while (!devt && (dev = class_dev_iter_next(&iter))) { ++ struct gendisk *disk = dev_to_disk(dev); ++ struct disk_part_iter piter; ++ struct hd_struct *part; ++ ++ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); ++ ++ while ((part = disk_part_iter_next(&piter))) { ++ bdev = bdget(part_devt(part)); ++ if (last && !got_last) { ++ if (last == bdev) ++ got_last = 1; ++ continue; ++ } ++ ++ if (blkdev_get(bdev, FMODE_READ, 0)) ++ continue; ++ ++ if (bdev_matches_key(bdev, key)) { ++ next = bdev; ++ break; ++ } ++ ++ blkdev_put(bdev, FMODE_READ); ++ } ++ disk_part_iter_exit(&piter); ++ } ++ class_dev_iter_exit(&iter); ++out: ++ if (last) ++ blkdev_put(last, FMODE_READ); ++ return next; ++} ++EXPORT_SYMBOL_GPL(next_bdev_of_type); ++ + /* + * Disk events - monitor disk events like media change and eject request. + */ +diff --git a/block/uuid.c b/block/uuid.c +new file mode 100644 +index 0000000..7ae50d3 +--- /dev/null ++++ b/block/uuid.c +@@ -0,0 +1,510 @@ ++#include ++#include ++#include ++#include ++#include ++ ++static int debug_enabled; ++ ++#define PRINTK(fmt, args...) do { \ ++ if (debug_enabled) \ ++ printk(KERN_DEBUG fmt, ## args); \ ++ } while(0) ++ ++#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ ++ do { \ ++ if (debug_enabled) \ ++ print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ ++ } while(0) ++ ++/* ++ * Simple UUID translation ++ */ ++ ++struct uuid_info { ++ const char *key; ++ const char *name; ++ long bkoff; ++ unsigned sboff; ++ unsigned sig_len; ++ const char *magic; ++ int uuid_offset; ++ int last_mount_offset; ++ int last_mount_size; ++}; ++ ++/* ++ * Based on libuuid's blkid_magic array. Note that I don't ++ * have uuid offsets for all of these yet - mssing ones are 0x0. ++ * Further information welcome. 
++ * ++ * Rearranged by page of fs signature for optimisation. ++ */ ++static struct uuid_info uuid_list[] = { ++ { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, ++ { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, ++ { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, ++ { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, ++ { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, ++ { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, ++ { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, ++ { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, ++ { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, ++ { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, ++ { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, ++ { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, ++ { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, ++ { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, ++ { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, ++ { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, ++ { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, ++ { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, ++ { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, 
++ { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, ++ { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, ++ { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, ++ { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, ++ { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, ++ { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, ++ { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, ++ { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, ++ { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, ++ { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, ++ { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } ++}; ++ ++static int null_uuid(const char *uuid) ++{ ++ int i; ++ ++ for (i = 0; i < 16 && !uuid[i]; i++); ++ ++ return (i == 16); ++} ++ ++ ++static void uuid_end_bio(struct bio *bio, int err) ++{ ++ struct page *page = bio->bi_io_vec[0].bv_page; ++ ++ if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) ++ SetPageError(page); ++ ++ unlock_page(page); ++ bio_put(bio); ++} ++ ++ ++/** ++ * submit - submit BIO request ++ * @dev: The block device we're using. ++ * @page_num: The page we're reading. ++ * ++ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the ++ * textbook - allocate and initialize the bio. If we're writing, make sure ++ * the page is marked as dirty. Then submit it and carry on." 
++ **/ ++static struct page *read_bdev_page(struct block_device *dev, int page_num) ++{ ++ struct bio *bio = NULL; ++ struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); ++ ++ if (!page) { ++ printk(KERN_ERR "Failed to allocate a page for reading data " ++ "in UUID checks."); ++ return NULL; ++ } ++ ++ bio = bio_alloc(GFP_NOFS, 1); ++ bio->bi_bdev = dev; ++ bio->bi_sector = page_num << 3; ++ bio->bi_end_io = uuid_end_bio; ++ ++ PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n", ++ (unsigned long) dev->bd_dev, page_num, bio, page); ++ ++ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { ++ printk(KERN_DEBUG "ERROR: adding page to bio at %d\n", ++ page_num); ++ bio_put(bio); ++ __free_page(page); ++ printk(KERN_DEBUG "read_bdev_page freed page %p (in error " ++ "path).\n", page); ++ return NULL; ++ } ++ ++ lock_page(page); ++ submit_bio(READ | REQ_SYNC | REQ_TOI, bio); ++ ++ wait_on_page_locked(page); ++ if (PageError(page)) { ++ __free_page(page); ++ page = NULL; ++ } ++ return page; ++} ++ ++int bdev_matches_key(struct block_device *bdev, const char *key) ++{ ++ unsigned char *data = NULL; ++ struct page *data_page = NULL; ++ ++ int dev_offset, pg_num, pg_off, i; ++ int last_pg_num = -1; ++ int result = 0; ++ char buf[50]; ++ ++ if (null_uuid(key)) { ++ PRINTK("Refusing to find a NULL key.\n"); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no bd_disk.\n", buf); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk->queue) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no queue.\n", buf); ++ return 0; ++ } ++ ++ for (i = 0; uuid_list[i].name; i++) { ++ struct uuid_info *dat = &uuid_list[i]; ++ ++ if (!dat->key || strcmp(dat->key, key)) ++ continue; ++ ++ dev_offset = (dat->bkoff << 10) + dat->sboff; ++ pg_num = dev_offset >> 12; ++ pg_off = dev_offset & 0xfff; ++ ++ if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) ++ continue; ++ ++ if (pg_num != last_pg_num) { ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ data_page = read_bdev_page(bdev, pg_num); ++ if (!data_page) ++ continue; ++ data = kmap(data_page); ++ } ++ ++ last_pg_num = pg_num; ++ ++ if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) ++ continue; ++ ++ result = 1; ++ break; ++ } ++ ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ ++ return result; ++} ++ ++/* ++ * part_matches_fs_info - Does the given partition match the details given? ++ * ++ * Returns a score saying how good the match is. ++ * 0 = no UUID match. ++ * 1 = UUID but last mount time differs. ++ * 2 = UUID, last mount time but not dev_t ++ * 3 = perfect match ++ * ++ * This lets us cope elegantly with probing resulting in dev_ts changing ++ * from boot to boot, and with the case where a user copies a partition ++ * (UUID is non unique), and we need to check the last mount time of the ++ * correct partition. 
++ */ ++int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek) ++{ ++ struct block_device *bdev; ++ struct fs_info *got; ++ int result = 0; ++ char buf[50]; ++ ++ if (null_uuid((char *) &seek->uuid)) { ++ PRINTK("Refusing to find a NULL uuid.\n"); ++ return 0; ++ } ++ ++ bdev = bdget(part_devt(part)); ++ ++ PRINTK("part_matches fs info considering %x.\n", part_devt(part)); ++ ++ if (blkdev_get(bdev, FMODE_READ, 0)) { ++ PRINTK("blkdev_get failed.\n"); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no bd_disk.\n", buf); ++ goto out; ++ } ++ ++ if (!bdev->bd_disk->queue) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no queue.\n", buf); ++ goto out; ++ } ++ ++ got = fs_info_from_block_dev(bdev); ++ ++ if (got && !memcmp(got->uuid, seek->uuid, 16)) { ++ PRINTK(" Have matching UUID.\n"); ++ PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount); ++ PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount); ++ result = 1; ++ ++ if (got->last_mount_size == seek->last_mount_size && ++ got->last_mount && seek->last_mount && ++ !memcmp(got->last_mount, seek->last_mount, ++ got->last_mount_size)) { ++ result = 2; ++ ++ PRINTK(" Matching last mount time.\n"); ++ ++ if (part_devt(part) == seek->dev_t) { ++ result = 3; ++ PRINTK(" Matching dev_t.\n"); ++ } else ++ PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t); ++ } ++ } ++ ++ PRINTK(" Score for %x is %d.\n", part_devt(part), result); ++ free_fs_info(got); ++out: ++ blkdev_put(bdev, FMODE_READ); ++ return result; ++} ++ ++void free_fs_info(struct fs_info *fs_info) ++{ ++ if (!fs_info || IS_ERR(fs_info)) ++ return; ++ ++ if (fs_info->last_mount) ++ kfree(fs_info->last_mount); ++ ++ kfree(fs_info); ++} ++EXPORT_SYMBOL_GPL(free_fs_info); ++ ++struct fs_info *fs_info_from_block_dev(struct block_device *bdev) ++{ ++ unsigned char *data = NULL; ++ struct page *data_page = NULL; ++ ++ int dev_offset, pg_num, pg_off; ++ int uuid_pg_num, uuid_pg_off, i; ++ unsigned char *uuid_data = NULL; ++ struct page *uuid_data_page = NULL; ++ ++ int last_pg_num = -1, last_uuid_pg_num = 0; ++ char buf[50]; ++ struct fs_info *fs_info = NULL; ++ ++ bdevname(bdev, buf); ++ ++ PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf); ++ ++ for (i = 0; uuid_list[i].name; i++) { ++ struct uuid_info *dat = &uuid_list[i]; ++ dev_offset = (dat->bkoff << 10) + dat->sboff; ++ pg_num = dev_offset >> 12; ++ pg_off = dev_offset & 0xfff; ++ uuid_pg_num = dat->uuid_offset >> 12; ++ uuid_pg_off = dat->uuid_offset & 0xfff; ++ ++ if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) ++ continue; ++ ++ /* Ignore partition types with no UUID offset */ ++ if (!dat->uuid_offset) ++ continue; ++ ++ if (pg_num != last_pg_num) { ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ data_page = read_bdev_page(bdev, pg_num); ++ if (!data_page) ++ continue; ++ data = kmap(data_page); ++ } ++ ++ last_pg_num = pg_num; ++ ++ if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) ++ continue; ++ ++ PRINTK("This partition looks like %s.\n", dat->name); ++ ++ fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL); ++ ++ if (!fs_info) { ++ PRINTK("Failed to allocate fs_info struct."); ++ fs_info = ERR_PTR(-ENOMEM); ++ break; ++ } ++ ++ /* UUID can't be off the end of the disk */ ++ if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) || ++ !dat->uuid_offset) ++ goto no_uuid; ++ ++ if (!uuid_data || uuid_pg_num != last_uuid_pg_num) { ++ /* No need to 
reread the page from above */ ++ if (uuid_pg_num == pg_num && uuid_data) ++ memcpy(uuid_data, data, PAGE_SIZE); ++ else { ++ if (uuid_data_page) { ++ kunmap(uuid_data_page); ++ __free_page(uuid_data_page); ++ } ++ uuid_data_page = read_bdev_page(bdev, uuid_pg_num); ++ if (!uuid_data_page) ++ continue; ++ uuid_data = kmap(uuid_data_page); ++ } ++ } ++ ++ last_uuid_pg_num = uuid_pg_num; ++ memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16); ++ fs_info->dev_t = bdev->bd_dev; ++ ++no_uuid: ++ PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev " ++ "returning uuid ", DUMP_PREFIX_NONE, 16, 1, ++ fs_info->uuid, 16, 0); ++ ++ if (dat->last_mount_size) { ++ int pg = dat->last_mount_offset >> 12, sz; ++ int off = dat->last_mount_offset & 0xfff; ++ struct page *last_mount = read_bdev_page(bdev, pg); ++ unsigned char *last_mount_data; ++ char *ptr; ++ ++ if (!last_mount) { ++ fs_info = ERR_PTR(-ENOMEM); ++ break; ++ } ++ last_mount_data = kmap(last_mount); ++ sz = dat->last_mount_size; ++ ptr = kmalloc(sz, GFP_KERNEL); ++ ++ if (!ptr) { ++ printk(KERN_EMERG "fs_info_from_block_dev " ++ "failed to get memory for last mount " ++ "timestamp."); ++ free_fs_info(fs_info); ++ fs_info = ERR_PTR(-ENOMEM); ++ } else { ++ fs_info->last_mount = ptr; ++ fs_info->last_mount_size = sz; ++ memcpy(ptr, &last_mount_data[off], sz); ++ } ++ ++ kunmap(last_mount); ++ __free_page(last_mount); ++ } ++ break; ++ } ++ ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ ++ if (uuid_data_page) { ++ kunmap(uuid_data_page); ++ __free_page(uuid_data_page); ++ } ++ ++ return fs_info; ++} ++EXPORT_SYMBOL_GPL(fs_info_from_block_dev); ++ ++static int __init uuid_debug_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ debug_enabled = value; ++ ++ return 1; ++} ++ ++__setup("uuid_debug", uuid_debug_setup); +diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c +index 31de104..fbe4f9e 100644 +--- a/drivers/acpi/acpi_pad.c ++++ b/drivers/acpi/acpi_pad.c +@@ -154,6 +154,7 @@ static int power_saving_thread(void *data) + u64 last_jiffies = 0; + + sched_setscheduler(current, SCHED_RR, ¶m); ++ set_freezable(); + + while (!kthread_should_stop()) { + int cpu; +diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c +index 2b7f77d..45178ca 100644 +--- a/drivers/base/power/main.c ++++ b/drivers/base/power/main.c +@@ -716,6 +716,7 @@ void dpm_resume(pm_message_t state) + async_synchronize_full(); + dpm_show_time(starttime, state, NULL); + } ++EXPORT_SYMBOL_GPL(dpm_resume); + + /** + * device_complete - Complete a PM transition for given device. +@@ -792,6 +793,7 @@ void dpm_complete(pm_message_t state) + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); + } ++EXPORT_SYMBOL_GPL(dpm_complete); + + /** + * dpm_resume_end - Execute "resume" callbacks and complete system transition. +@@ -1214,6 +1216,7 @@ int dpm_suspend(pm_message_t state) + dpm_show_time(starttime, state, NULL); + return error; + } ++EXPORT_SYMBOL_GPL(dpm_suspend); + + /** + * device_prepare - Prepare a device for system power transition. +@@ -1315,6 +1318,7 @@ int dpm_prepare(pm_message_t state) + mutex_unlock(&dpm_list_mtx); + return error; + } ++EXPORT_SYMBOL_GPL(dpm_prepare); + + /** + * dpm_suspend_start - Prepare devices for PM transition and suspend them. 
+diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c +index 79715e7..76e8bc2 100644 +--- a/drivers/base/power/wakeup.c ++++ b/drivers/base/power/wakeup.c +@@ -23,6 +23,7 @@ + * if wakeup events are registered during or immediately before the transition. + */ + bool events_check_enabled __read_mostly; ++EXPORT_SYMBOL_GPL(events_check_enabled); + + /* + * Combined counters of registered wakeup events and wakeup events in progress. +@@ -712,6 +713,7 @@ bool pm_wakeup_pending(void) + + return ret; + } ++EXPORT_SYMBOL_GPL(pm_wakeup_pending); + + /** + * pm_get_wakeup_count - Read the number of registered wakeup events. +diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c +index 5ac841f..f3cb20a 100644 +--- a/drivers/block/xen-blkback/blkback.c ++++ b/drivers/block/xen-blkback/blkback.c +@@ -397,6 +397,7 @@ int xen_blkif_schedule(void *arg) + struct xen_vbd *vbd = &blkif->vbd; + + xen_blkif_get(blkif); ++ set_freezable(); + + while (!kthread_should_stop()) { + if (try_to_freeze()) +diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c +index 24efae4..8db2c38 100644 +--- a/drivers/gpu/drm/drm_gem.c ++++ b/drivers/gpu/drm/drm_gem.c +@@ -139,7 +139,8 @@ int drm_gem_object_init(struct drm_device *dev, + BUG_ON((size & (PAGE_SIZE - 1)) != 0); + + obj->dev = dev; +- obj->filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); ++ obj->filp = shmem_file_setup("drm mm object", size, ++ VM_NORESERVE, 1); + if (IS_ERR(obj->filp)) + return PTR_ERR(obj->filp); + +diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c +index 7d759a4..83f8e3b 100644 +--- a/drivers/gpu/drm/ttm/ttm_tt.c ++++ b/drivers/gpu/drm/ttm/ttm_tt.c +@@ -337,7 +337,7 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage) + if (!persistent_swap_storage) { + swap_storage = shmem_file_setup("ttm swap", + ttm->num_pages << PAGE_SHIFT, +- 0); ++ 0, 0); + if (unlikely(IS_ERR(swap_storage))) { + pr_err("Failed allocating swap storage\n"); + return PTR_ERR(swap_storage); +diff --git a/drivers/md/md.c b/drivers/md/md.c +index 3db3d1b..a09c18d 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -33,6 +33,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -7331,10 +7332,14 @@ void md_do_sync(struct md_thread *thread) + * + */ + ++ set_freezable(); ++ + do { + mddev->curr_resync = 2; + + try_again: ++ try_to_freeze(); ++ + if (kthread_should_stop()) + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + +@@ -7357,6 +7362,9 @@ void md_do_sync(struct md_thread *thread) + * time 'round when curr_resync == 2 + */ + continue; ++ ++ try_to_freeze(); ++ + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' +@@ -7369,6 +7377,7 @@ void md_do_sync(struct md_thread *thread) + " share one or more physical units)\n", + desc, mdname(mddev), mdname(mddev2)); + mddev_put(mddev2); ++ try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + schedule(); +@@ -7486,6 +7495,8 @@ void md_do_sync(struct md_thread *thread) + || kthread_should_stop()); + } + ++ try_to_freeze(); ++ + if (kthread_should_stop()) + goto interrupted; + +@@ -7530,6 +7541,7 @@ void md_do_sync(struct md_thread *thread) + last_mark = next; + } + ++ try_to_freeze(); + + if (kthread_should_stop()) + goto interrupted; +@@ -7743,8 +7755,10 @@ static void reap_sync_thread(struct mddev *mddev) + */ + void md_check_recovery(struct mddev *mddev) + { +- if (mddev->suspended) ++#ifdef 
CONFIG_FREEZER ++ if (mddev->suspended || unlikely(atomic_read(&system_freezing_cnt))) + return; ++#endif + + if (mddev->bitmap) + bitmap_daemon_work(mddev); +diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c +index 876e709..b0653a2 100644 +--- a/drivers/net/irda/stir4200.c ++++ b/drivers/net/irda/stir4200.c +@@ -739,6 +739,8 @@ static int stir_transmit_thread(void *arg) + struct net_device *dev = stir->netdev; + struct sk_buff *skb; + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + #ifdef CONFIG_PM + /* if suspending, then power off and wait */ +diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c +index 1a27280..39a2c61 100644 +--- a/drivers/tty/vt/vt.c ++++ b/drivers/tty/vt/vt.c +@@ -2422,6 +2422,7 @@ int vt_kmsg_redirect(int new) + else + return kmsg_con; + } ++EXPORT_SYMBOL_GPL(vt_kmsg_redirect); + + /* + * Console on virtual terminal +diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c +index bdcb13c..ce8fc9c 100644 +--- a/drivers/uwb/uwbd.c ++++ b/drivers/uwb/uwbd.c +@@ -271,6 +271,7 @@ static int uwbd(void *param) + struct uwb_event *evt; + int should_stop = 0; + ++ set_freezable(); + while (1) { + wait_event_interruptible_timeout( + rc->uwbd.wq, +diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c +index 7994d933..ced5cba 100644 +--- a/drivers/w1/w1.c ++++ b/drivers/w1/w1.c +@@ -974,6 +974,7 @@ int w1_process(void *data) + * time can be calculated in jiffies once. + */ + const unsigned long jtime = msecs_to_jiffies(w1_timeout * 1000); ++ set_freezable(); + + while (!kthread_should_stop()) { + if (dev->search_count) { +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 58b7d14..87080a8 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -307,6 +307,8 @@ static int worker_loop(void *arg) + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + ++ set_freezable(); ++ + do { + again: + while (1) { +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index a8f652d..82588e5 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1636,6 +1636,8 @@ static int cleaner_kthread(void *arg) + { + struct btrfs_root *root = arg; + ++ set_freezable(); ++ + do { + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { +@@ -1665,6 +1667,8 @@ static int transaction_kthread(void *arg) + unsigned long delay; + bool cannot_commit; + ++ set_freezable(); ++ + do { + cannot_commit = false; + delay = HZ * 30; +diff --git a/fs/drop_caches.c b/fs/drop_caches.c +index c00e055..d023de0 100644 +--- a/fs/drop_caches.c ++++ b/fs/drop_caches.c +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + /* A global variable is a bit ugly, but it keeps the code simple */ +@@ -49,6 +50,13 @@ static void drop_slab(void) + } while (nr_objects > 10); + } + ++/* For TuxOnIce */ ++void drop_pagecache(void) ++{ ++ iterate_supers(drop_pagecache_sb, NULL); ++} ++EXPORT_SYMBOL_GPL(drop_pagecache); ++ + int drop_caches_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) + { +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 3d4fb81..4161100 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2847,6 +2847,7 @@ static int ext4_lazyinit_thread(void *arg) + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); ++ set_freezable(); + + cont_thread: + while (true) { +diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c +index 9a2ca8b..88c7502 100644 +--- a/fs/gfs2/log.c ++++ b/fs/gfs2/log.c +@@ -792,6 +792,8 @@ int gfs2_logd(void 
*data) + unsigned long t = 1; + DEFINE_WAIT(wait); + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { +diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c +index 06122d0..decc509 100644 +--- a/fs/gfs2/quota.c ++++ b/fs/gfs2/quota.c +@@ -1410,6 +1410,8 @@ int gfs2_quotad(void *data) + DEFINE_WAIT(wait); + int empty; + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + + /* Update the master statfs file */ +diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c +index 2eb952c..ce8687e 100644 +--- a/fs/jfs/jfs_logmgr.c ++++ b/fs/jfs/jfs_logmgr.c +@@ -2337,6 +2337,8 @@ int jfsIOWait(void *arg) + { + struct lbuf *bp; + ++ set_freezable(); ++ + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { +diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c +index 5fcc02e..e0bc87b 100644 +--- a/fs/jfs/jfs_txnmgr.c ++++ b/fs/jfs/jfs_txnmgr.c +@@ -2752,6 +2752,8 @@ int jfs_lazycommit(void *arg) + unsigned long flags; + struct jfs_sb_info *sbi; + ++ set_freezable(); ++ + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ +@@ -2936,6 +2938,8 @@ int jfs_sync(void *arg) + struct jfs_inode_info *jfs_ip; + tid_t tid; + ++ set_freezable(); ++ + do { + /* + * write each inode on the anonymous inode list +diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c +index a5752a58..3ae43e5 100644 +--- a/fs/nilfs2/segment.c ++++ b/fs/nilfs2/segment.c +@@ -2440,6 +2440,8 @@ static int nilfs_segctor_thread(void *arg) + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int timeout = 0; + ++ set_freezable(); ++ + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; + +diff --git a/fs/super.c b/fs/super.c +index 12f1237..411cb28 100644 +--- a/fs/super.c ++++ b/fs/super.c +@@ -38,6 +38,8 @@ + + + LIST_HEAD(super_blocks); ++EXPORT_SYMBOL_GPL(super_blocks); ++ + DEFINE_SPINLOCK(sb_lock); + + static char *sb_writers_name[SB_FREEZE_LEVELS] = { +diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c +index 0eda725..55de808 100644 +--- a/fs/xfs/xfs_trans_ail.c ++++ b/fs/xfs/xfs_trans_ail.c +@@ -511,6 +511,7 @@ xfsaild( + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + ++ set_freezable(); + current->flags |= PF_MEMALLOC; + + while (!kthread_should_stop()) { +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 820e7aa..b7d41d5 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -32,6 +32,8 @@ + /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ + #include + ++extern int trap_non_toi_io; ++ + #define BIO_DEBUG + + #ifdef BIO_DEBUG +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index cdf1119..fc4c817 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -175,6 +175,7 @@ enum rq_flag_bits { + __REQ_IO_STAT, /* account I/O stat */ + __REQ_MIXED_MERGE, /* merge of different types, fail separately */ + __REQ_KERNEL, /* direct IO to kernel pages */ ++ __REQ_TOI, /* TuxOnIce I/O */ + __REQ_NR_BITS, /* stops here */ + }; + +@@ -222,6 +223,7 @@ enum rq_flag_bits { + #define REQ_IO_STAT (1 << __REQ_IO_STAT) + #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) + #define REQ_SECURE (1 << __REQ_SECURE) ++#define REQ_TOI (1 << __REQ_TOI) + #define REQ_KERNEL (1 << __REQ_KERNEL) + + #endif /* __LINUX_BLK_TYPES_H */ +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7d2e893..6b3856c 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1632,6 +1632,8 @@ 
struct super_operations { + #define S_IMA 1024 /* Inode has an associated IMA struct */ + #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ + #define S_NOSEC 4096 /* no suid or xattr security attributes */ ++#define S_ATOMIC_COPY 8192 /* Pages mapped with this inode need to be ++ atomically copied (gem) */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -2055,6 +2057,13 @@ extern struct super_block *freeze_bdev(struct block_device *); + extern void emergency_thaw_all(void); + extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); + extern int fsync_bdev(struct block_device *); ++extern int fsync_super(struct super_block *); ++extern int fsync_no_super(struct block_device *); ++#define FS_FREEZER_FUSE 1 ++#define FS_FREEZER_NORMAL 2 ++#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL) ++void freeze_filesystems(int which); ++void thaw_filesystems(int which); + #else + static inline void bd_forget(struct inode *inode) {} + static inline int sync_blockdev(struct block_device *bdev) { return 0; } +diff --git a/include/linux/fs_uuid.h b/include/linux/fs_uuid.h +new file mode 100644 +index 0000000..3234135 +--- /dev/null ++++ b/include/linux/fs_uuid.h +@@ -0,0 +1,19 @@ ++#include ++ ++struct hd_struct; ++struct block_device; ++ ++struct fs_info { ++ char uuid[16]; ++ dev_t dev_t; ++ char *last_mount; ++ int last_mount_size; ++}; ++ ++int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek); ++dev_t blk_lookup_fs_info(struct fs_info *seek); ++struct fs_info *fs_info_from_block_dev(struct block_device *bdev); ++void free_fs_info(struct fs_info *fs_info); ++int bdev_matches_key(struct block_device *bdev, const char *key); ++struct block_device *next_bdev_of_type(struct block_device *last, ++ const char *key); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 9d9dcc3..8e3282e 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1679,6 +1679,7 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, + unsigned long shrink_slab(struct shrink_control *shrink, + unsigned long nr_pages_scanned, + unsigned long lru_pages); ++void drop_pagecache(void); + + #ifndef CONFIG_MMU + #define randomize_va_space 0 +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index 30aa0dc..b7ea3d4 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -46,7 +46,8 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) + extern int shmem_init(void); + extern int shmem_fill_super(struct super_block *sb, void *data, int silent); + extern struct file *shmem_file_setup(const char *name, +- loff_t size, unsigned long flags); ++ loff_t size, unsigned long flags, ++ int atomic_copy); + extern int shmem_zero_setup(struct vm_area_struct *); + extern int shmem_lock(struct file *file, int lock, struct user_struct *user); + extern void shmem_unlock_mapping(struct address_space *mapping); +diff --git a/include/linux/suspend.h b/include/linux/suspend.h +index d4e3f16..3f143b0 100644 +--- a/include/linux/suspend.h ++++ b/include/linux/suspend.h +@@ -418,6 +418,73 @@ extern bool pm_print_times_enabled; + #define pm_print_times_enabled (false) + #endif + ++enum { ++ TOI_CAN_HIBERNATE, ++ TOI_CAN_RESUME, ++ TOI_RESUME_DEVICE_OK, ++ TOI_NORESUME_SPECIFIED, ++ TOI_SANITY_CHECK_PROMPT, ++ TOI_CONTINUE_REQ, ++ TOI_RESUMED_BEFORE, ++ TOI_BOOT_TIME, ++ TOI_NOW_RESUMING, ++ TOI_IGNORE_LOGLEVEL, ++ TOI_TRYING_TO_RESUME, ++ TOI_LOADING_ALT_IMAGE, ++ TOI_STOP_RESUME, ++ TOI_IO_STOPPED, 
++ TOI_NOTIFIERS_PREPARE, ++ TOI_CLUSTER_MODE, ++ TOI_BOOT_KERNEL, ++}; ++ ++#ifdef CONFIG_TOI ++ ++/* Used in init dir files */ ++extern unsigned long toi_state; ++#define set_toi_state(bit) (set_bit(bit, &toi_state)) ++#define clear_toi_state(bit) (clear_bit(bit, &toi_state)) ++#define test_toi_state(bit) (test_bit(bit, &toi_state)) ++extern int toi_running; ++ ++#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action)) ++extern int try_tuxonice_hibernate(void); ++ ++#else /* !CONFIG_TOI */ ++ ++#define toi_state (0) ++#define set_toi_state(bit) do { } while (0) ++#define clear_toi_state(bit) do { } while (0) ++#define test_toi_state(bit) (0) ++#define toi_running (0) ++ ++static inline int try_tuxonice_hibernate(void) { return 0; } ++#define test_action_state(bit) (0) ++ ++#endif /* CONFIG_TOI */ ++ ++#ifdef CONFIG_HIBERNATION ++#ifdef CONFIG_TOI ++extern void try_tuxonice_resume(void); ++#else ++#define try_tuxonice_resume() do { } while (0) ++#endif ++ ++extern int resume_attempted; ++extern int software_resume(void); ++ ++static inline void check_resume_attempted(void) ++{ ++ if (resume_attempted) ++ return; ++ ++ software_resume(); ++} ++#else ++#define check_resume_attempted() do { } while (0) ++#define resume_attempted (0) ++#endif ++ + #ifdef CONFIG_PM_AUTOSLEEP + + /* kernel/power/autosleep.c */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 68df9c1..ceabd9d 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -217,6 +217,7 @@ extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; + extern unsigned long dirty_balance_reserve; + extern unsigned int nr_free_buffer_pages(void); ++extern unsigned int nr_unallocated_buffer_pages(void); + extern unsigned int nr_free_pagecache_pages(void); + + /* Definition of global_page_state not available yet */ +@@ -264,6 +265,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + struct zone *zone, + unsigned long *nr_scanned); + extern unsigned long shrink_all_memory(unsigned long nr_pages); ++extern unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, ++ gfp_t mask); + extern int vm_swappiness; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; +@@ -360,13 +363,17 @@ extern void swapcache_free(swp_entry_t, struct page *page); + extern int free_swap_and_cache(swp_entry_t); + extern int swap_type_of(dev_t, sector_t, struct block_device **); + extern unsigned int count_swap_pages(int, int); ++extern sector_t map_swap_entry(swp_entry_t entry, struct block_device **); + extern sector_t map_swap_page(struct page *, struct block_device **); + extern sector_t swapdev_block(int, pgoff_t); ++extern struct swap_info_struct *get_swap_info_struct(unsigned); + extern int page_swapcount(struct page *); + extern struct swap_info_struct *page_swap_info(struct page *); + extern int reuse_swap_page(struct page *); + extern int try_to_free_swap(struct page *); + struct backing_dev_info; ++extern void get_swap_range_of_type(int type, swp_entry_t *start, ++ swp_entry_t *end, unsigned int limit); + + #ifdef CONFIG_MEMCG + extern void +diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h +index 78d5b8a..864847f 100644 +--- a/include/uapi/linux/netlink.h ++++ b/include/uapi/linux/netlink.h +@@ -26,6 +26,8 @@ + #define NETLINK_ECRYPTFS 19 + #define NETLINK_RDMA 20 + #define NETLINK_CRYPTO 21 /* Crypto layer */ ++#define NETLINK_TOI_USERUI 22 /* TuxOnIce's userui */ ++#define NETLINK_TOI_USM 23 
/* Userspace storage manager */ + + #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +diff --git a/init/do_mounts.c b/init/do_mounts.c +index a2b49f2..58be071 100644 +--- a/init/do_mounts.c ++++ b/init/do_mounts.c +@@ -281,6 +281,7 @@ fail: + done: + return res; + } ++EXPORT_SYMBOL_GPL(name_to_dev_t); + + static int __init root_dev_setup(char *line) + { +@@ -582,6 +583,8 @@ void __init prepare_namespace(void) + if (is_floppy && rd_doload && rd_load_disk(0)) + ROOT_DEV = Root_RAM0; + ++ check_resume_attempted(); ++ + mount_root(); + out: + devtmpfs_mount("dev"); +diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c +index a32ec1c..74460d0 100644 +--- a/init/do_mounts_initrd.c ++++ b/init/do_mounts_initrd.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -75,6 +76,11 @@ static void __init handle_initrd(void) + + current->flags &= ~PF_FREEZER_SKIP; + ++ if (!resume_attempted) ++ printk(KERN_ERR "TuxOnIce: No attempt was made to resume from " ++ "any image that might exist.\n"); ++ clear_toi_state(TOI_BOOT_TIME); ++ + /* move initrd to rootfs' /old */ + sys_mount("..", ".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ +diff --git a/init/main.c b/init/main.c +index 63534a1..010d242 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -125,6 +125,7 @@ extern void softirq_init(void); + char __initdata boot_command_line[COMMAND_LINE_SIZE]; + /* Untouched saved command line (eg. for /proc) */ + char *saved_command_line; ++EXPORT_SYMBOL_GPL(saved_command_line); + /* Command line for parameter parsing */ + static char *static_command_line; + +diff --git a/ipc/shm.c b/ipc/shm.c +index 4fa6d8f..a9f4ae9 100644 +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -505,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; +- file = shmem_file_setup(name, size, acctflag); ++ file = shmem_file_setup(name, size, acctflag, 0); + } + error = PTR_ERR(file); + if (IS_ERR(file)) +diff --git a/kernel/cpu.c b/kernel/cpu.c +index b5e4ab2..f9c94c3 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -493,6 +493,7 @@ int disable_nonboot_cpus(void) + cpu_maps_update_done(); + return error; + } ++EXPORT_SYMBOL_GPL(disable_nonboot_cpus); + + void __weak arch_enable_nonboot_cpus_begin(void) + { +@@ -531,6 +532,7 @@ void __ref enable_nonboot_cpus(void) + out: + cpu_maps_update_done(); + } ++EXPORT_SYMBOL_GPL(enable_nonboot_cpus); + + static int __init alloc_frozen_cpus(void) + { +diff --git a/kernel/kmod.c b/kernel/kmod.c +index 56dd349..5d3c529 100644 +--- a/kernel/kmod.c ++++ b/kernel/kmod.c +@@ -450,6 +450,7 @@ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); + } ++EXPORT_SYMBOL_GPL(__usermodehelper_set_disable_depth); + + /** + * __usermodehelper_disable - Prevent new helpers from being started. 
+@@ -483,6 +484,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; + } ++EXPORT_SYMBOL_GPL(__usermodehelper_disable); + + static void helper_lock(void) + { +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 691dc2e..c037774 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -490,6 +490,8 @@ int kthread_worker_fn(void *worker_ptr) + + WARN_ON(worker->task); + worker->task = current; ++ set_freezable(); ++ + repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + +diff --git a/kernel/pid.c b/kernel/pid.c +index f2c6a68..b46f32a 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -446,6 +446,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) + " protection"); + return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); + } ++EXPORT_SYMBOL_GPL(find_task_by_pid_ns); + + struct task_struct *find_task_by_vpid(pid_t vnr) + { +diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig +index 5dfdc9e..788f046 100644 +--- a/kernel/power/Kconfig ++++ b/kernel/power/Kconfig +@@ -91,6 +91,286 @@ config PM_STD_PARTITION + suspended image to. It will simply pick the first available swap + device. + ++menuconfig TOI_CORE ++ tristate "Enhanced Hibernation (TuxOnIce)" ++ depends on HIBERNATION ++ default y ++ ---help--- ++ TuxOnIce is the 'new and improved' suspend support. ++ ++ See the TuxOnIce home page (tuxonice.net) ++ for FAQs, HOWTOs and other documentation. ++ ++ comment "Image Storage (you need at least one allocator)" ++ depends on TOI_CORE ++ ++ config TOI_FILE ++ tristate "File Allocator" ++ depends on TOI_CORE ++ default y ++ ---help--- ++ This option enables support for storing an image in a ++ simple file. You might want this if your swap is ++ sometimes full enough that you don't have enough spare ++ space to store an image. ++ ++ config TOI_SWAP ++ tristate "Swap Allocator" ++ depends on TOI_CORE && SWAP ++ default y ++ ---help--- ++ This option enables support for storing an image in your ++ swap space. ++ ++ comment "General Options" ++ depends on TOI_CORE ++ ++ config TOI_INCREMENTAL ++ tristate "Incremental Image Support" ++ depends on TOI_CORE && CRYPTO && BROKEN ++ select CRYPTO_SHA1 ++ default y ++ ---help--- ++ This option adds initial support for using hashing algorithms ++ (a quick, internal implementation of Fletcher16 and SHA1 via ++ cryptoapi) to discover the number of pages which are ++ unchanged since the image was last written. It is hoped that ++ this will be an initial step toward implementing storing just ++ the differences between consecutive images, which will ++ increase the amount of storage needed for the image, but also ++ increase the speed at which writing an image occurs and ++ reduce the wear and tear on drives. ++ ++ comment "No increemntal image support available without Cryptoapi support." ++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_PRUNE ++ tristate "Image pruning support" ++ depends on TOI_CORE && CRYPTO && BROKEN ++ default y ++ ---help--- ++ This option adds support for using cryptoapi hashing ++ algorithms to identify pages with the same content. We ++ then write a much smaller pointer to the first copy of ++ the data instead of a complete (perhaps compressed) ++ additional copy. ++ ++ You probably want this, so say Y here. ++ ++ comment "No image pruning support available without Cryptoapi support." 
++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_CRYPTO ++ tristate "Compression support" ++ depends on TOI_CORE && CRYPTO ++ default y ++ ---help--- ++ This option adds support for using cryptoapi compression ++ algorithms. Compression is particularly useful as it can ++ more than double your suspend and resume speed (depending ++ upon how well your image compresses). ++ ++ You probably want this, so say Y here. ++ ++ comment "No compression support available without Cryptoapi support." ++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_USERUI ++ tristate "Userspace User Interface support" ++ depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) ++ default y ++ ---help--- ++ This option enabled support for a userspace based user interface ++ to TuxOnIce, which allows you to have a nice display while suspending ++ and resuming, and also enables features such as pressing escape to ++ cancel a cycle or interactive debugging. ++ ++ config TOI_USERUI_DEFAULT_PATH ++ string "Default userui program location" ++ default "/usr/local/sbin/tuxoniceui_text" ++ depends on TOI_USERUI ++ ---help--- ++ This entry allows you to specify a default path to the userui binary. ++ ++ config TOI_DEFAULT_IMAGE_SIZE_LIMIT ++ int "Default image size limit" ++ range -2 65536 ++ default "-2" ++ depends on TOI_CORE ++ ---help--- ++ This entry allows you to specify a default image size limit. It can ++ be overridden at run-time using /sys/power/tuxonice/image_size_limit. ++ ++ config TOI_KEEP_IMAGE ++ bool "Allow Keep Image Mode" ++ depends on TOI_CORE ++ ---help--- ++ This option allows you to keep and image and reuse it. It is intended ++ __ONLY__ for use with systems where all filesystems are mounted read- ++ only (kiosks, for example). To use it, compile this option in and boot ++ normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. ++ When you resume, the image will not be removed. You will be unable to turn ++ off swap partitions (assuming you are using the swap allocator), but future ++ suspends simply do a power-down. The image can be updated using the ++ kernel command line parameter suspend_act= to turn off the keep image ++ bit. Keep image mode is a little less user friendly on purpose - it ++ should not be used without thought! ++ ++ config TOI_REPLACE_SWSUSP ++ bool "Replace swsusp by default" ++ default y ++ depends on TOI_CORE ++ ---help--- ++ TuxOnIce can replace swsusp. This option makes that the default state, ++ requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want ++ to use the vanilla kernel functionality. Note that your initrd/ramfs will ++ need to do this before trying to resume, too. ++ With overriding swsusp enabled, echoing disk to /sys/power/state will ++ start a TuxOnIce cycle. If resume= doesn't specify an allocator and both ++ the swap and file allocators are compiled in, the swap allocator will be ++ used by default. ++ ++ config TOI_IGNORE_LATE_INITCALL ++ bool "Wait for initrd/ramfs to run, by default" ++ default n ++ depends on TOI_CORE ++ ---help--- ++ When booting, TuxOnIce can check for an image and start to resume prior ++ to any initrd/ramfs running (via a late initcall). ++ ++ If you don't have an initrd/ramfs, this is what you want to happen - ++ otherwise you won't be able to safely resume. You should set this option ++ to 'No'. ++ ++ If, however, you want your initrd/ramfs to run anyway before resuming, ++ you need to tell TuxOnIce to ignore that earlier opportunity to resume. 
++ This can be done either by using this compile time option, or by ++ overriding this option with the boot-time parameter toi_initramfs_resume_only=1. ++ ++ Note that if TuxOnIce can't resume at the earlier opportunity, the ++ value of this option won't matter - the initramfs/initrd (if any) will ++ run anyway. ++ ++ menuconfig TOI_CLUSTER ++ tristate "Cluster support" ++ default n ++ depends on TOI_CORE && NET && BROKEN ++ ---help--- ++ Support for linking multiple machines in a cluster so that they suspend ++ and resume together. ++ ++ config TOI_DEFAULT_CLUSTER_INTERFACE ++ string "Default cluster interface" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default interface on which to communicate with other nodes in ++ the cluster. ++ ++ If no value is set here, cluster support will be disabled by default. ++ ++ config TOI_DEFAULT_CLUSTER_KEY ++ string "Default cluster key" ++ default "Default" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default key used by this node. All nodes in the same cluster ++ have the same key. Multiple clusters may coexist on the same lan ++ by using different values for this key. ++ ++ config TOI_CLUSTER_IMAGE_TIMEOUT ++ int "Timeout when checking for image" ++ default 15 ++ depends on TOI_CLUSTER ++ ---help--- ++ Timeout (seconds) before continuing to boot when waiting to see ++ whether other nodes might have an image. Set to -1 to wait ++ indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue ++ booting sooner than this timeout. ++ ++ config TOI_CLUSTER_WAIT_UNTIL_NODES ++ int "Nodes without image before continuing" ++ default 0 ++ depends on TOI_CLUSTER ++ ---help--- ++ When booting and no image is found, we wait to see if other nodes ++ have an image before continuing to boot. This value lets us ++ continue after seeing a certain number of nodes without an image, ++ instead of continuing to wait for the timeout. Set to 0 to only ++ use the timeout. ++ ++ config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE ++ string "Default pre-hibernate script" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default script to be called when starting to hibernate. ++ ++ config TOI_DEFAULT_CLUSTER_POST_HIBERNATE ++ string "Default post-hibernate script" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default script to be called after resuming from hibernation. ++ ++ config TOI_DEFAULT_WAIT ++ int "Default waiting time for emergency boot messages" ++ default "25" ++ range -1 32768 ++ depends on TOI_CORE ++ help ++ TuxOnIce can display warnings very early in the process of resuming, ++ if (for example) it appears that you have booted a kernel that doesn't ++ match an image on disk. It can then give you the opportunity to either ++ continue booting that kernel, or reboot the machine. This option can be ++ used to control how long to wait in such circumstances. -1 means wait ++ forever. 0 means don't wait at all (do the default action, which will ++ generally be to continue booting and remove the image). Values of 1 or ++ more indicate a number of seconds (up to 255) to wait before doing the ++ default. ++ ++ config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE ++ int "Default extra pages allowance" ++ default "2000" ++ range 500 32768 ++ depends on TOI_CORE ++ help ++ This value controls the default for the allowance TuxOnIce makes for ++ drivers to allocate extra memory during the atomic copy. The default ++ value of 2000 will be okay in most cases. 
If you are using ++ DRI, the easiest way to find what value to use is to try to hibernate ++ and look at how many pages were actually needed in the sysfs entry ++ /sys/power/tuxonice/debug_info (first number on the last line), adding ++ a little extra because the value is not always the same. ++ ++ config TOI_CHECKSUM ++ bool "Checksum pageset2" ++ default n ++ depends on TOI_CORE ++ select CRYPTO ++ select CRYPTO_ALGAPI ++ select CRYPTO_MD4 ++ ---help--- ++ Adds support for checksumming pageset2 pages, to ensure you really get an ++ atomic copy. Since some filesystems (XFS especially) change metadata even ++ when there's no other activity, we need this to check for pages that have ++ been changed while we were saving the page cache. If your debugging output ++ always says no pages were resaved, you may be able to safely disable this ++ option. ++ ++config TOI ++ bool ++ depends on TOI_CORE!=n ++ default y ++ ++config TOI_EXPORTS ++ bool ++ depends on TOI_SWAP=m || TOI_FILE=m || \ ++ TOI_CRYPTO=m || TOI_CLUSTER=m || \ ++ TOI_USERUI=m || TOI_CORE=m ++ default y ++ ++config TOI_ZRAM_SUPPORT ++ def_bool y ++ depends on TOI && ZRAM!=n ++ + config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS +diff --git a/kernel/power/Makefile b/kernel/power/Makefile +index 29472bf..dd5d4f2 100644 +--- a/kernel/power/Makefile ++++ b/kernel/power/Makefile +@@ -1,6 +1,37 @@ + + ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + ++tuxonice_core-y := tuxonice_modules.o ++ ++obj-$(CONFIG_TOI) += tuxonice_builtin.o ++ ++tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o ++ ++# Compile these in after allocation debugging, if used. ++ ++tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ ++ tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ ++ tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ ++ tuxonice_power_off.o tuxonice_atomic_copy.o ++ ++tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o ++ ++tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o ++ ++obj-$(CONFIG_TOI_CORE) += tuxonice_core.o ++obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o ++obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o ++obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o ++ ++tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ ++ tuxonice_bio_signature.o ++ ++obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o ++obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o ++obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o ++ ++obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o ++ + obj-y += qos.o + obj-$(CONFIG_PM) += main.o + obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o +diff --git a/kernel/power/console.c b/kernel/power/console.c +index b1dc456..bbf19a5 100644 +--- a/kernel/power/console.c ++++ b/kernel/power/console.c +@@ -23,6 +23,7 @@ int pm_prepare_console(void) + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); + return 0; + } ++EXPORT_SYMBOL_GPL(pm_prepare_console); + + void pm_restore_console(void) + { +@@ -31,3 +32,4 @@ void pm_restore_console(void) + vt_kmsg_redirect(orig_kmsg); + } + } ++EXPORT_SYMBOL_GPL(pm_restore_console); +diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c +index b26f5f1..9c04c3e 100644 +--- a/kernel/power/hibernate.c ++++ b/kernel/power/hibernate.c +@@ -29,14 +29,15 @@ + #include + #include + +-#include "power.h" ++#include "tuxonice.h" + + + static int nocompress; + static int noresume; + static int resume_wait; + static int resume_delay; +-static char resume_file[256] = CONFIG_PM_STD_PARTITION; ++char resume_file[256] = 
CONFIG_PM_STD_PARTITION; ++EXPORT_SYMBOL_GPL(resume_file); + dev_t swsusp_resume_device; + sector_t swsusp_resume_block; + int in_suspend __nosavedata; +@@ -114,21 +115,23 @@ static int hibernation_test(int level) { return 0; } + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. + */ +-static int platform_begin(int platform_mode) ++int platform_begin(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->begin() : 0; + } ++EXPORT_SYMBOL_GPL(platform_begin); + + /** + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + */ +-static void platform_end(int platform_mode) ++void platform_end(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->end(); + } ++EXPORT_SYMBOL_GPL(platform_end); + + /** + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. +@@ -138,11 +141,12 @@ static void platform_end(int platform_mode) + * if so configured, and return an error code if that fails. + */ + +-static int platform_pre_snapshot(int platform_mode) ++int platform_pre_snapshot(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; + } ++EXPORT_SYMBOL_GPL(platform_pre_snapshot); + + /** + * platform_leave - Call platform to prepare a transition to the working state. +@@ -153,11 +157,12 @@ static int platform_pre_snapshot(int platform_mode) + * + * This routine is called on one CPU with interrupts disabled. + */ +-static void platform_leave(int platform_mode) ++void platform_leave(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); + } ++EXPORT_SYMBOL_GPL(platform_leave); + + /** + * platform_finish - Call platform to switch the system to the working state. +@@ -168,11 +173,12 @@ static void platform_leave(int platform_mode) + * + * This routine must be called after platform_prepare(). + */ +-static void platform_finish(int platform_mode) ++void platform_finish(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); + } ++EXPORT_SYMBOL_GPL(platform_finish); + + /** + * platform_pre_restore - Prepare for hibernate image restoration. +@@ -184,11 +190,12 @@ static void platform_finish(int platform_mode) + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. + */ +-static int platform_pre_restore(int platform_mode) ++int platform_pre_restore(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; + } ++EXPORT_SYMBOL_GPL(platform_pre_restore); + + /** + * platform_restore_cleanup - Switch to the working state after failing restore. +@@ -201,21 +208,23 @@ static int platform_pre_restore(int platform_mode) + * function must be called too, regardless of the result of + * platform_pre_restore(). + */ +-static void platform_restore_cleanup(int platform_mode) ++void platform_restore_cleanup(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); + } ++EXPORT_SYMBOL_GPL(platform_restore_cleanup); + + /** + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. 
+ */ +-static void platform_recover(int platform_mode) ++void platform_recover(int platform_mode) + { + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); + } ++EXPORT_SYMBOL_GPL(platform_recover); + + /** + * swsusp_show_speed - Print time elapsed between two events during hibernation. +@@ -573,6 +582,7 @@ int hibernation_platform_enter(void) + + return error; + } ++EXPORT_SYMBOL_GPL(hibernation_platform_enter); + + /** + * power_down - Shut the machine down for hibernation. +@@ -632,6 +642,9 @@ int hibernate(void) + { + int error; + ++ if (test_action_state(TOI_REPLACE_SWSUSP)) ++ return try_tuxonice_hibernate(); ++ + lock_system_sleep(); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { +@@ -715,11 +728,19 @@ int hibernate(void) + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +-static int software_resume(void) ++int software_resume(void) + { + int error; + unsigned int flags; + ++ resume_attempted = 1; ++ ++ /* ++ * We can't know (until an image header - if any - is loaded), whether ++ * we did override swsusp. We therefore ensure that both are tried. ++ */ ++ try_tuxonice_resume(); ++ + /* + * If the user said "noresume".. bail out early. + */ +@@ -1094,6 +1115,7 @@ static int __init hibernate_setup(char *str) + static int __init noresume_setup(char *str) + { + noresume = 1; ++ set_toi_state(TOI_NORESUME_SPECIFIED); + return 1; + } + +diff --git a/kernel/power/main.c b/kernel/power/main.c +index d77663b..bbd0c86 100644 +--- a/kernel/power/main.c ++++ b/kernel/power/main.c +@@ -19,12 +19,14 @@ + #include "power.h" + + DEFINE_MUTEX(pm_mutex); ++EXPORT_SYMBOL_GPL(pm_mutex); + + #ifdef CONFIG_PM_SLEEP + + /* Routines for PM-transition notifications */ + +-static BLOCKING_NOTIFIER_HEAD(pm_chain_head); ++BLOCKING_NOTIFIER_HEAD(pm_chain_head); ++EXPORT_SYMBOL_GPL(pm_chain_head); + + int register_pm_notifier(struct notifier_block *nb) + { +@@ -44,6 +46,7 @@ int pm_notifier_call_chain(unsigned long val) + + return notifier_to_errno(ret); + } ++EXPORT_SYMBOL_GPL(pm_notifier_call_chain); + + /* If set, devices may be suspended and resumed asynchronously. */ + int pm_async_enabled = 1; +@@ -277,6 +280,7 @@ static inline void pm_print_times_init(void) {} + #endif /* CONFIG_PM_SLEEP_DEBUG */ + + struct kobject *power_kobj; ++EXPORT_SYMBOL_GPL(power_kobj); + + /** + * state - control system power state. +diff --git a/kernel/power/power.h b/kernel/power/power.h +index 7d4b7ff..98b9660 100644 +--- a/kernel/power/power.h ++++ b/kernel/power/power.h +@@ -35,8 +35,12 @@ static inline char *check_image_kernel(struct swsusp_info *info) + return arch_hibernation_header_restore(info) ? + "architecture specific data" : NULL; + } ++#else ++extern char *check_image_kernel(struct swsusp_info *info); + #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ ++extern int init_header(struct swsusp_info *info); + ++extern char resume_file[256]; + /* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] 
+@@ -55,6 +59,7 @@ extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); + extern int hibernation_restore(int platform_mode); + extern int hibernation_platform_enter(void); ++extern void platform_recover(int platform_mode); + + #else /* !CONFIG_HIBERNATION */ + +@@ -74,6 +79,8 @@ static struct kobj_attribute _name##_attr = { \ + .store = _name##_store, \ + } + ++extern struct pbe *restore_pblist; ++ + /* Preferred image size in bytes (default 500 MB) */ + extern unsigned long image_size; + /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +@@ -268,6 +275,90 @@ static inline void suspend_thaw_processes(void) + } + #endif + ++extern struct page *saveable_page(struct zone *z, unsigned long p); ++#ifdef CONFIG_HIGHMEM ++extern struct page *saveable_highmem_page(struct zone *z, unsigned long p); ++#else ++static ++inline struct page *saveable_highmem_page(struct zone *z, unsigned long p) ++{ ++ return NULL; ++} ++#endif ++ ++#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) ++extern struct list_head nosave_regions; ++ ++/** ++ * This structure represents a range of page frames the contents of which ++ * should not be saved during the suspend. ++ */ ++ ++struct nosave_region { ++ struct list_head list; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++}; ++ ++#define BM_END_OF_MAP (~0UL) ++ ++#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) ++ ++struct bm_block { ++ struct list_head hook; /* hook into a list of bitmap blocks */ ++ unsigned long start_pfn; /* pfn represented by the first bit */ ++ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ ++ unsigned long *data; /* bitmap representing pages */ ++}; ++ ++/* struct bm_position is used for browsing memory bitmaps */ ++ ++struct bm_position { ++ struct bm_block *block; ++ int bit; ++}; ++ ++struct memory_bitmap { ++ struct list_head blocks; /* list of bitmap blocks */ ++ struct linked_page *p_list; /* list of pages used to store zone ++ * bitmap objects and bitmap block ++ * objects ++ */ ++ struct bm_position *states; /* most recently used bit position */ ++ int num_states; /* when iterating over a bitmap and ++ * number of states we support. 
++ */ ++}; ++ ++extern int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed); ++extern int memory_bm_create_index(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed, int index); ++extern void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); ++extern void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern void memory_bm_clear_bit_index(struct memory_bitmap *bm, unsigned long pfn, int index); ++extern int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern int memory_bm_test_bit_index(struct memory_bitmap *bm, unsigned long pfn, int index); ++extern unsigned long memory_bm_next_pfn(struct memory_bitmap *bm); ++extern unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, ++ int index); ++extern void memory_bm_position_reset(struct memory_bitmap *bm); ++extern void memory_bm_clear(struct memory_bitmap *bm); ++extern void memory_bm_copy(struct memory_bitmap *source, ++ struct memory_bitmap *dest); ++extern void memory_bm_dup(struct memory_bitmap *source, ++ struct memory_bitmap *dest); ++extern int memory_bm_set_iterators(struct memory_bitmap *bm, int number); ++ ++#ifdef CONFIG_TOI ++struct toi_module_ops; ++extern int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); ++extern int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); ++#endif ++ + #ifdef CONFIG_PM_AUTOSLEEP + + /* kernel/power/autosleep.c */ +diff --git a/kernel/power/process.c b/kernel/power/process.c +index 98088e0..b340c98 100644 +--- a/kernel/power/process.c ++++ b/kernel/power/process.c +@@ -134,6 +134,7 @@ int freeze_processes(void) + thaw_processes(); + return error; + } ++EXPORT_SYMBOL_GPL(freeze_processes); + + /** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. +@@ -160,6 +161,7 @@ int freeze_kernel_threads(void) + thaw_kernel_threads(); + return error; + } ++EXPORT_SYMBOL_GPL(freeze_kernel_threads); + + void thaw_processes(void) + { +@@ -187,6 +189,7 @@ void thaw_processes(void) + schedule(); + printk("done.\n"); + } ++EXPORT_SYMBOL_GPL(thaw_processes); + + void thaw_kernel_threads(void) + { +@@ -207,3 +210,4 @@ void thaw_kernel_threads(void) + schedule(); + printk("done.\n"); + } ++EXPORT_SYMBOL_GPL(thaw_kernel_threads); +diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c +index 0de2857..c47a1a8 100644 +--- a/kernel/power/snapshot.c ++++ b/kernel/power/snapshot.c +@@ -35,6 +35,8 @@ + #include + + #include "power.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_pagedir.h" + + static int swsusp_page_is_free(struct page *); + static void swsusp_set_page_forbidden(struct page *); +@@ -71,6 +73,10 @@ void __init hibernate_image_size_init(void) + * directly to their "original" page frames. 
+ */ + struct pbe *restore_pblist; ++EXPORT_SYMBOL_GPL(restore_pblist); ++ ++int resume_attempted; ++EXPORT_SYMBOL_GPL(resume_attempted); + + /* Pointer to an auxiliary buffer (1 page) */ + static void *buffer; +@@ -113,6 +119,9 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) + + unsigned long get_safe_page(gfp_t gfp_mask) + { ++ if (toi_running) ++ return toi_get_nonconflicting_page(); ++ + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + } + +@@ -249,47 +258,53 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) + * the represented memory area. + */ + +-#define BM_END_OF_MAP (~0UL) +- +-#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +- +-struct bm_block { +- struct list_head hook; /* hook into a list of bitmap blocks */ +- unsigned long start_pfn; /* pfn represented by the first bit */ +- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ +- unsigned long *data; /* bitmap representing pages */ +-}; +- + static inline unsigned long bm_block_bits(struct bm_block *bb) + { + return bb->end_pfn - bb->start_pfn; + } + +-/* strcut bm_position is used for browsing memory bitmaps */ ++/* Functions that operate on memory bitmaps */ + +-struct bm_position { +- struct bm_block *block; +- int bit; +-}; ++void memory_bm_position_reset_index(struct memory_bitmap *bm, int index) ++{ ++ bm->states[index].block = list_entry(bm->blocks.next, ++ struct bm_block, hook); ++ bm->states[index].bit = 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_position_reset_index); + +-struct memory_bitmap { +- struct list_head blocks; /* list of bitmap blocks */ +- struct linked_page *p_list; /* list of pages used to store zone +- * bitmap objects and bitmap block +- * objects +- */ +- struct bm_position cur; /* most recently used bit position */ +-}; ++void memory_bm_position_reset(struct memory_bitmap *bm) ++{ ++ int i; + +-/* Functions that operate on memory bitmaps */ ++ for (i = 0; i < bm->num_states; i++) { ++ bm->states[i].block = list_entry(bm->blocks.next, ++ struct bm_block, hook); ++ bm->states[i].bit = 0; ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_position_reset); + +-static void memory_bm_position_reset(struct memory_bitmap *bm) ++int memory_bm_set_iterators(struct memory_bitmap *bm, int number) + { +- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); +- bm->cur.bit = 0; +-} ++ int bytes = number * sizeof(struct bm_position); ++ struct bm_position *new_states; ++ ++ if (number < bm->num_states) ++ return 0; + +-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); ++ new_states = kmalloc(bytes, GFP_KERNEL); ++ if (!new_states) ++ return -ENOMEM; ++ ++ if (bm->states) ++ kfree(bm->states); ++ ++ bm->states = new_states; ++ bm->num_states = number; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_set_iterators); + + /** + * create_bm_block_list - create a list of block bitmap objects +@@ -397,8 +412,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) + /** + * memory_bm_create - allocate memory for a memory bitmap + */ +-static int +-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) ++int memory_bm_create_index(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed, int states) + { + struct chain_allocator ca; + struct list_head mem_extents; +@@ -442,6 +457,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + } + } + ++ if (!error) ++ error = memory_bm_set_iterators(bm, states); ++ + bm->p_list = ca.chain; + 
memory_bm_position_reset(bm); + Exit: +@@ -453,11 +471,18 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + memory_bm_free(bm, PG_UNSAFE_CLEAR); + goto Exit; + } ++EXPORT_SYMBOL_GPL(memory_bm_create_index); ++ ++int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) ++{ ++ return memory_bm_create_index(bm, gfp_mask, safe_needed, 1); ++} ++EXPORT_SYMBOL_GPL(memory_bm_create); + + /** + * memory_bm_free - free memory occupied by the memory bitmap @bm + */ +-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) ++void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) + { + struct bm_block *bb; + +@@ -468,15 +493,22 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) + free_list_of_pages(bm->p_list, clear_nosave_free); + + INIT_LIST_HEAD(&bm->blocks); ++ ++ if (bm->states) { ++ kfree(bm->states); ++ bm->states = NULL; ++ bm->num_states = 0; ++ } + } ++EXPORT_SYMBOL_GPL(memory_bm_free); + + /** + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds + * to given pfn. The cur_zone_bm member of @bm and the cur_block member +- * of @bm->cur_zone_bm are updated. ++ * of @bm->states[i]_zone_bm are updated. + */ +-static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +- void **addr, unsigned int *bit_nr) ++static int memory_bm_find_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ void **addr, unsigned int *bit_nr, int state) + { + struct bm_block *bb; + +@@ -484,7 +516,7 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ +- bb = bm->cur.block; ++ bb = bm->states[state].block; + if (pfn < bb->start_pfn) + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) +@@ -499,15 +531,21 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + return -EFAULT; + + /* The block has been found */ +- bm->cur.block = bb; ++ bm->states[state].block = bb; + pfn -= bb->start_pfn; +- bm->cur.bit = pfn + 1; ++ bm->states[state].bit = pfn + 1; + *bit_nr = pfn; + *addr = bb->data; + return 0; + } + +-static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) ++static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, ++ void **addr, unsigned int *bit_nr) ++{ ++ return memory_bm_find_bit_index(bm, pfn, addr, bit_nr, 0); ++} ++ ++void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) + { + void *addr; + unsigned int bit; +@@ -517,6 +555,7 @@ static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) + BUG_ON(error); + set_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_set_bit); + + static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) + { +@@ -530,27 +569,43 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) + return error; + } + +-static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) ++void memory_bm_clear_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ int index) + { + void *addr; + unsigned int bit; + int error; + +- error = memory_bm_find_bit(bm, pfn, &addr, &bit); ++ error = memory_bm_find_bit_index(bm, pfn, &addr, &bit, index); + BUG_ON(error); + clear_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_clear_bit_index); ++ ++void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) ++{ ++ 
memory_bm_clear_bit_index(bm, pfn, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_clear_bit); + +-static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) ++int memory_bm_test_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ int index) + { + void *addr; + unsigned int bit; + int error; + +- error = memory_bm_find_bit(bm, pfn, &addr, &bit); ++ error = memory_bm_find_bit_index(bm, pfn, &addr, &bit, index); + BUG_ON(error); + return test_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_test_bit_index); ++ ++int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) ++{ ++ return memory_bm_test_bit_index(bm, pfn, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_test_bit); + + static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) + { +@@ -569,43 +624,184 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) + * this function. + */ + +-static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) ++unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index) + { + struct bm_block *bb; + int bit; + +- bb = bm->cur.block; ++ bb = bm->states[index].block; + do { +- bit = bm->cur.bit; ++ bit = bm->states[index].bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); +- bm->cur.block = bb; +- bm->cur.bit = 0; ++ bm->states[index].block = bb; ++ bm->states[index].bit = 0; + } while (&bb->hook != &bm->blocks); + +- memory_bm_position_reset(bm); ++ memory_bm_position_reset_index(bm, index); + return BM_END_OF_MAP; + + Return_pfn: +- bm->cur.bit = bit + 1; ++ bm->states[index].bit = bit + 1; + return bb->start_pfn + bit; + } ++EXPORT_SYMBOL_GPL(memory_bm_next_pfn_index); + +-/** +- * This structure represents a range of page frames the contents of which +- * should not be saved during the suspend. 
+- */ ++unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) ++{ ++ return memory_bm_next_pfn_index(bm, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_next_pfn); + +-struct nosave_region { +- struct list_head list; +- unsigned long start_pfn; +- unsigned long end_pfn; +-}; ++void memory_bm_clear(struct memory_bitmap *bm) ++{ ++ unsigned long pfn; + +-static LIST_HEAD(nosave_regions); ++ memory_bm_position_reset(bm); ++ pfn = memory_bm_next_pfn(bm); ++ while (pfn != BM_END_OF_MAP) { ++ memory_bm_clear_bit(bm, pfn); ++ pfn = memory_bm_next_pfn(bm); ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_clear); ++ ++void memory_bm_copy(struct memory_bitmap *source, struct memory_bitmap *dest) ++{ ++ unsigned long pfn; ++ ++ memory_bm_position_reset(source); ++ pfn = memory_bm_next_pfn(source); ++ while (pfn != BM_END_OF_MAP) { ++ memory_bm_set_bit(dest, pfn); ++ pfn = memory_bm_next_pfn(source); ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_copy); ++ ++void memory_bm_dup(struct memory_bitmap *source, struct memory_bitmap *dest) ++{ ++ memory_bm_clear(dest); ++ memory_bm_copy(source, dest); ++} ++EXPORT_SYMBOL_GPL(memory_bm_dup); ++ ++#ifdef CONFIG_TOI ++#define DEFINE_MEMORY_BITMAP(name) \ ++struct memory_bitmap *name; \ ++EXPORT_SYMBOL_GPL(name) ++ ++DEFINE_MEMORY_BITMAP(pageset1_map); ++DEFINE_MEMORY_BITMAP(pageset1_copy_map); ++DEFINE_MEMORY_BITMAP(pageset2_map); ++DEFINE_MEMORY_BITMAP(page_resave_map); ++DEFINE_MEMORY_BITMAP(io_map); ++DEFINE_MEMORY_BITMAP(nosave_map); ++DEFINE_MEMORY_BITMAP(free_map); ++ ++int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) ++{ ++ int result = 0; ++ unsigned int nr = 0; ++ struct bm_block *bb; ++ ++ if (!bm) ++ return result; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) ++ nr++; ++ ++ result = (*rw_chunk)(WRITE, NULL, (char *) &nr, sizeof(unsigned int)); ++ if (result) ++ return result; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) { ++ result = (*rw_chunk)(WRITE, NULL, (char *) &bb->start_pfn, ++ 2 * sizeof(unsigned long)); ++ if (result) ++ return result; ++ ++ result = (*rw_chunk)(WRITE, NULL, (char *) bb->data, PAGE_SIZE); ++ if (result) ++ return result; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_write); ++ ++int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) ++{ ++ int result = 0; ++ unsigned int nr, i; ++ struct bm_block *bb; ++ ++ if (!bm) ++ return result; ++ ++ result = memory_bm_create(bm, GFP_KERNEL, 0); ++ ++ if (result) ++ return result; ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &nr, sizeof(unsigned int)); ++ if (result) ++ goto Free; ++ ++ for (i = 0; i < nr; i++) { ++ unsigned long pfn; ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &pfn, ++ sizeof(unsigned long)); ++ if (result) ++ goto Free; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) ++ if (bb->start_pfn == pfn) ++ break; ++ ++ if (&bb->hook == &bm->blocks) { ++ printk(KERN_ERR ++ "TuxOnIce: Failed to load memory bitmap.\n"); ++ result = -EINVAL; ++ goto Free; ++ } ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &pfn, ++ sizeof(unsigned long)); ++ if (result) ++ goto Free; ++ ++ if (pfn != bb->end_pfn) { ++ printk(KERN_ERR ++ "TuxOnIce: Failed to load memory bitmap. 
" ++ "End PFN doesn't match what was saved.\n"); ++ result = -EINVAL; ++ goto Free; ++ } ++ ++ result = (*rw_chunk)(READ, NULL, (char *) bb->data, PAGE_SIZE); ++ ++ if (result) ++ goto Free; ++ } ++ ++ return 0; ++ ++Free: ++ memory_bm_free(bm, PG_ANY); ++ return result; ++} ++EXPORT_SYMBOL_GPL(memory_bm_read); ++#endif ++ ++LIST_HEAD(nosave_regions); ++EXPORT_SYMBOL_GPL(nosave_regions); + + /** + * register_nosave_region - register a range of page frames the contents +@@ -843,7 +1039,7 @@ static unsigned int count_free_highmem_pages(void) + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. + */ +-static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) ++struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) + { + struct page *page; + +@@ -865,6 +1061,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) + + return page; + } ++EXPORT_SYMBOL_GPL(saveable_highmem_page); + + /** + * count_highmem_pages - compute the total number of saveable highmem +@@ -890,11 +1087,6 @@ static unsigned int count_highmem_pages(void) + } + return n; + } +-#else +-static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +-{ +- return NULL; +-} + #endif /* CONFIG_HIGHMEM */ + + /** +@@ -905,7 +1097,7 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. + */ +-static struct page *saveable_page(struct zone *zone, unsigned long pfn) ++struct page *saveable_page(struct zone *zone, unsigned long pfn) + { + struct page *page; + +@@ -930,6 +1122,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) + + return page; + } ++EXPORT_SYMBOL_GPL(saveable_page); + + /** + * count_data_pages - compute the total number of saveable non-highmem +@@ -1580,6 +1773,9 @@ asmlinkage int swsusp_save(void) + { + unsigned int nr_pages, nr_highmem; + ++ if (toi_running) ++ return toi_post_context_save(); ++ + printk(KERN_INFO "PM: Creating hibernation image:\n"); + + drain_local_pages(NULL); +@@ -1620,14 +1816,14 @@ asmlinkage int swsusp_save(void) + } + + #ifndef CONFIG_ARCH_HIBERNATION_HEADER +-static int init_header_complete(struct swsusp_info *info) ++int init_header_complete(struct swsusp_info *info) + { + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); + info->version_code = LINUX_VERSION_CODE; + return 0; + } + +-static char *check_image_kernel(struct swsusp_info *info) ++char *check_image_kernel(struct swsusp_info *info) + { + if (info->version_code != LINUX_VERSION_CODE) + return "kernel version"; +@@ -1641,6 +1837,7 @@ static char *check_image_kernel(struct swsusp_info *info) + return "machine"; + return NULL; + } ++EXPORT_SYMBOL_GPL(check_image_kernel); + #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + + unsigned long snapshot_get_image_size(void) +@@ -1648,7 +1845,7 @@ unsigned long snapshot_get_image_size(void) + return nr_copy_pages + nr_meta_pages + 1; + } + +-static int init_header(struct swsusp_info *info) ++int init_header(struct swsusp_info *info) + { + memset(info, 0, sizeof(struct swsusp_info)); + info->num_physpages = num_physpages; +@@ -1658,6 +1855,7 @@ static int init_header(struct swsusp_info *info) + info->size <<= PAGE_SHIFT; + return init_header_complete(info); + } ++EXPORT_SYMBOL_GPL(init_header); + + /** + * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm +diff --git 
a/kernel/power/suspend.c b/kernel/power/suspend.c +index d4feda0..e1337b6 100644 +--- a/kernel/power/suspend.c ++++ b/kernel/power/suspend.c +@@ -286,6 +286,7 @@ int suspend_devices_and_enter(suspend_state_t state) + suspend_ops->recover(); + goto Resume_devices; + } ++EXPORT_SYMBOL_GPL(suspend_devices_and_enter); + + /** + * suspend_finish - Clean up before finishing the suspend sequence. +diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h +new file mode 100644 +index 0000000..6f8d127 +--- /dev/null ++++ b/kernel/power/tuxonice.h +@@ -0,0 +1,227 @@ ++/* ++ * kernel/power/tuxonice.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations used throughout swsusp. ++ * ++ */ ++ ++#ifndef KERNEL_POWER_TOI_H ++#define KERNEL_POWER_TOI_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice_pageflags.h" ++#include "power.h" ++ ++#define TOI_CORE_VERSION "3.3" ++#define TOI_HEADER_VERSION 3 ++#define MY_BOOT_KERNEL_DATA_VERSION 4 ++ ++struct toi_boot_kernel_data { ++ int version; ++ int size; ++ unsigned long toi_action; ++ unsigned long toi_debug_state; ++ u32 toi_default_console_level; ++ int toi_io_time[2][2]; ++ char toi_nosave_commandline[COMMAND_LINE_SIZE]; ++ unsigned long pages_used[33]; ++ unsigned long incremental_bytes_in; ++ unsigned long incremental_bytes_out; ++ unsigned long compress_bytes_in; ++ unsigned long compress_bytes_out; ++ unsigned long pruned_pages; ++}; ++ ++extern struct toi_boot_kernel_data toi_bkd; ++ ++/* Location of book kernel data struct in kernel being resumed */ ++extern unsigned long boot_kernel_data_buffer; ++ ++/* == Action states == */ ++ ++enum { ++ TOI_REBOOT, ++ TOI_PAUSE, ++ TOI_LOGALL, ++ TOI_CAN_CANCEL, ++ TOI_KEEP_IMAGE, ++ TOI_FREEZER_TEST, ++ TOI_SINGLESTEP, ++ TOI_PAUSE_NEAR_PAGESET_END, ++ TOI_TEST_FILTER_SPEED, ++ TOI_TEST_BIO, ++ TOI_NO_PAGESET2, ++ TOI_IGNORE_ROOTFS, ++ TOI_REPLACE_SWSUSP, ++ TOI_PAGESET2_FULL, ++ TOI_ABORT_ON_RESAVE_NEEDED, ++ TOI_NO_MULTITHREADED_IO, ++ TOI_NO_DIRECT_LOAD, /* Obsolete */ ++ TOI_LATE_CPU_HOTPLUG, ++ TOI_GET_MAX_MEM_ALLOCD, ++ TOI_NO_FLUSHER_THREAD, ++ TOI_NO_PS2_IF_UNNEEDED, ++ TOI_POST_RESUME_BREAKPOINT, ++ TOI_NO_READAHEAD, ++}; ++ ++extern unsigned long toi_bootflags_mask; ++ ++#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) ++ ++/* == Result states == */ ++ ++enum { ++ TOI_ABORTED, ++ TOI_ABORT_REQUESTED, ++ TOI_NOSTORAGE_AVAILABLE, ++ TOI_INSUFFICIENT_STORAGE, ++ TOI_FREEZING_FAILED, ++ TOI_KEPT_IMAGE, ++ TOI_WOULD_EAT_MEMORY, ++ TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, ++ TOI_PM_SEM, ++ TOI_DEVICE_REFUSED, ++ TOI_SYSDEV_REFUSED, ++ TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, ++ TOI_UNABLE_TO_PREPARE_IMAGE, ++ TOI_FAILED_MODULE_INIT, ++ TOI_FAILED_MODULE_CLEANUP, ++ TOI_FAILED_IO, ++ TOI_OUT_OF_MEMORY, ++ TOI_IMAGE_ERROR, ++ TOI_PLATFORM_PREP_FAILED, ++ TOI_CPU_HOTPLUG_FAILED, ++ TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ ++ TOI_RESAVE_NEEDED, ++ TOI_CANT_SUSPEND, ++ TOI_NOTIFIERS_PREPARE_FAILED, ++ TOI_PRE_SNAPSHOT_FAILED, ++ TOI_PRE_RESTORE_FAILED, ++ TOI_USERMODE_HELPERS_ERR, ++ TOI_CANT_USE_ALT_RESUME, ++ TOI_HEADER_TOO_BIG, ++ TOI_WAKEUP_EVENT, ++ TOI_SYSCORE_REFUSED, ++ TOI_DPM_PREPARE_FAILED, ++ TOI_DPM_SUSPEND_FAILED, ++ TOI_NUM_RESULT_STATES /* Used in printing debug info only */ ++}; ++ ++extern unsigned long toi_result; ++ ++#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) ++#define set_abort_result(bit) 
(test_and_set_bit(TOI_ABORTED, &toi_result), \ ++ test_and_set_bit(bit, &toi_result)) ++#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) ++#define test_result_state(bit) (test_bit(bit, &toi_result)) ++ ++/* == Debug sections and levels == */ ++ ++/* debugging levels. */ ++enum { ++ TOI_STATUS = 0, ++ TOI_ERROR = 2, ++ TOI_LOW, ++ TOI_MEDIUM, ++ TOI_HIGH, ++ TOI_VERBOSE, ++}; ++ ++enum { ++ TOI_ANY_SECTION, ++ TOI_EAT_MEMORY, ++ TOI_IO, ++ TOI_HEADER, ++ TOI_WRITER, ++ TOI_MEMORY, ++ TOI_PAGEDIR, ++ TOI_COMPRESS, ++ TOI_BIO, ++}; ++ ++#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) ++#define clear_debug_state(bit) \ ++ (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) ++#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) ++ ++/* == Steps in hibernating == */ ++ ++enum { ++ STEP_HIBERNATE_PREPARE_IMAGE, ++ STEP_HIBERNATE_SAVE_IMAGE, ++ STEP_HIBERNATE_POWERDOWN, ++ STEP_RESUME_CAN_RESUME, ++ STEP_RESUME_LOAD_PS1, ++ STEP_RESUME_DO_RESTORE, ++ STEP_RESUME_READ_PS2, ++ STEP_RESUME_GO, ++ STEP_RESUME_ALT_IMAGE, ++ STEP_CLEANUP, ++ STEP_QUIET_CLEANUP ++}; ++ ++/* == TuxOnIce states == ++ (see also include/linux/suspend.h) */ ++ ++#define get_toi_state() (toi_state) ++#define restore_toi_state(saved_state) \ ++ do { toi_state = saved_state; } while (0) ++ ++/* == Module support == */ ++ ++struct toi_core_fns { ++ int (*post_context_save)(void); ++ unsigned long (*get_nonconflicting_page)(void); ++ int (*try_hibernate)(void); ++ void (*try_resume)(void); ++}; ++ ++extern struct toi_core_fns *toi_core_fns; ++ ++/* == All else == */ ++#define KB(x) ((x) << (PAGE_SHIFT - 10)) ++#define MB(x) ((x) >> (20 - PAGE_SHIFT)) ++ ++extern int toi_start_anything(int toi_or_resume); ++extern void toi_finish_anything(int toi_or_resume); ++ ++extern int save_image_part1(void); ++extern int toi_atomic_restore(void); ++ ++extern int toi_try_hibernate(void); ++extern void toi_try_resume(void); ++ ++extern int __toi_post_context_save(void); ++ ++extern unsigned int nr_hibernates; ++extern char alt_resume_param[256]; ++ ++extern void copyback_post(void); ++extern int toi_hibernate(void); ++extern unsigned long extra_pd1_pages_used; ++ ++#define SECTOR_SIZE 512 ++ ++extern void toi_early_boot_message(int can_erase_image, int default_answer, ++ char *warning_reason, ...); ++ ++extern int do_check_can_resume(void); ++extern int do_toi_step(int step); ++extern int toi_launch_userspace_program(char *command, int channel_no, ++ int wait, int debug); ++ ++extern char tuxonice_signature[9]; ++ ++extern int toi_start_other_threads(void); ++extern void toi_stop_other_threads(void); ++#endif +diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c +new file mode 100644 +index 0000000..675f2b5 +--- /dev/null ++++ b/kernel/power/tuxonice_alloc.c +@@ -0,0 +1,314 @@ ++/* ++ * kernel/power/tuxonice_alloc.c ++ * ++ * Copyright (C) 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ */ ++ ++#ifdef CONFIG_PM_DEBUG ++#include ++#include ++#include "tuxonice_modules.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice.h" ++ ++#define TOI_ALLOC_PATHS 40 ++ ++static DEFINE_MUTEX(toi_alloc_mutex); ++ ++static struct toi_module_ops toi_alloc_ops; ++ ++static int toi_fail_num; ++ ++static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], ++ toi_free_count[TOI_ALLOC_PATHS], ++ toi_test_count[TOI_ALLOC_PATHS], ++ toi_fail_count[TOI_ALLOC_PATHS]; ++static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; ++static int cur_allocd, max_allocd; ++ ++static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { ++ "", /* 0 */ ++ "get_io_info_struct", ++ "extent", ++ "extent (loading chain)", ++ "userui channel", ++ "userui arg", /* 5 */ ++ "attention list metadata", ++ "extra pagedir memory metadata", ++ "bdev metadata", ++ "extra pagedir memory", ++ "header_locations_read", /* 10 */ ++ "bio queue", ++ "prepare_readahead", ++ "i/o buffer", ++ "writer buffer in bio_init", ++ "checksum buffer", /* 15 */ ++ "compression buffer", ++ "filewriter signature op", ++ "set resume param alloc1", ++ "set resume param alloc2", ++ "debugging info buffer", /* 20 */ ++ "check can resume buffer", ++ "write module config buffer", ++ "read module config buffer", ++ "write image header buffer", ++ "read pageset1 buffer", /* 25 */ ++ "get_have_image_data buffer", ++ "checksum page", ++ "worker rw loop", ++ "get nonconflicting page", ++ "ps1 load addresses", /* 30 */ ++ "remove swap image", ++ "swap image exists", ++ "swap parse sig location", ++ "sysfs kobj", ++ "swap mark resume attempted buffer", /* 35 */ ++ "cluster member", ++ "boot kernel data buffer", ++ "setting swap signature", ++ "block i/o bdev struct" ++}; ++ ++#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ ++ do { \ ++ BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ ++ \ ++ if (FAIL_NUM == toi_fail_num) { \ ++ atomic_inc(&toi_test_count[FAIL_NUM]); \ ++ toi_fail_num = 0; \ ++ return FAIL_VAL; \ ++ } \ ++ } while (0) ++ ++static void alloc_update_stats(int fail_num, void *result, int size) ++{ ++ if (!result) { ++ atomic_inc(&toi_fail_count[fail_num]); ++ return; ++ } ++ ++ atomic_inc(&toi_alloc_count[fail_num]); ++ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { ++ mutex_lock(&toi_alloc_mutex); ++ toi_cur_allocd[fail_num]++; ++ cur_allocd += size; ++ if (unlikely(cur_allocd > max_allocd)) { ++ int i; ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) ++ toi_max_allocd[i] = toi_cur_allocd[i]; ++ max_allocd = cur_allocd; ++ } ++ mutex_unlock(&toi_alloc_mutex); ++ } ++} ++ ++static void free_update_stats(int fail_num, int size) ++{ ++ BUG_ON(fail_num >= TOI_ALLOC_PATHS); ++ atomic_inc(&toi_free_count[fail_num]); ++ if (unlikely(atomic_read(&toi_free_count[fail_num]) > ++ atomic_read(&toi_alloc_count[fail_num]))) ++ dump_stack(); ++ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { ++ mutex_lock(&toi_alloc_mutex); ++ cur_allocd -= size; ++ toi_cur_allocd[fail_num]--; ++ mutex_unlock(&toi_alloc_mutex); ++ } ++} ++ ++void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) ++{ ++ void *result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, NULL); ++ result = kzalloc(size, flags); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, result, size); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_kzalloc); ++ ++unsigned long toi_get_free_pages(int fail_num, gfp_t mask, ++ unsigned int order) ++{ ++ unsigned long result; ++ ++ if 
(toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, 0); ++ result = __get_free_pages(mask, order); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, ++ PAGE_SIZE << order); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_get_free_pages); ++ ++struct page *toi_alloc_page(int fail_num, gfp_t mask) ++{ ++ struct page *result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, NULL); ++ result = alloc_page(mask); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_alloc_page); ++ ++unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) ++{ ++ unsigned long result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, 0); ++ result = get_zeroed_page(mask); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_get_zeroed_page); ++ ++void toi_kfree(int fail_num, const void *arg, int size) ++{ ++ if (arg && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, size); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ kfree(arg); ++} ++EXPORT_SYMBOL_GPL(toi_kfree); ++ ++void toi_free_page(int fail_num, unsigned long virt) ++{ ++ if (virt && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ free_page(virt); ++} ++EXPORT_SYMBOL_GPL(toi_free_page); ++ ++void toi__free_page(int fail_num, struct page *page) ++{ ++ if (page && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ __free_page(page); ++} ++EXPORT_SYMBOL_GPL(toi__free_page); ++ ++void toi_free_pages(int fail_num, struct page *page, int order) ++{ ++ if (page && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE << order); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ __free_pages(page, order); ++} ++ ++void toi_alloc_print_debug_stats(void) ++{ ++ int i, header_done = 0; ++ ++ if (!toi_alloc_ops.enabled) ++ return; ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) ++ if (atomic_read(&toi_alloc_count[i]) != ++ atomic_read(&toi_free_count[i])) { ++ if (!header_done) { ++ printk(KERN_INFO "Idx Allocs Frees Tests " ++ " Fails Max Description\n"); ++ header_done = 1; ++ } ++ ++ printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, ++ atomic_read(&toi_alloc_count[i]), ++ atomic_read(&toi_free_count[i]), ++ atomic_read(&toi_test_count[i]), ++ atomic_read(&toi_fail_count[i]), ++ toi_max_allocd[i], ++ toi_alloc_desc[i]); ++ } ++} ++EXPORT_SYMBOL_GPL(toi_alloc_print_debug_stats); ++ ++static int toi_alloc_initialise(int starting_cycle) ++{ ++ int i; ++ ++ if (!starting_cycle) ++ return 0; ++ ++ if (toi_trace_allocs) ++ dump_stack(); ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) { ++ atomic_set(&toi_alloc_count[i], 0); ++ atomic_set(&toi_free_count[i], 0); ++ atomic_set(&toi_test_count[i], 0); ++ atomic_set(&toi_fail_count[i], 0); ++ toi_cur_allocd[i] = 0; ++ toi_max_allocd[i] = 0; ++ }; ++ ++ max_allocd = 0; ++ cur_allocd = 0; ++ return 0; ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), ++ SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, ++ NULL), ++ SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, ++ 
TOI_GET_MAX_MEM_ALLOCD, 0), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, ++ NULL) ++}; ++ ++static struct toi_module_ops toi_alloc_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "allocation debugging", ++ .directory = "alloc", ++ .module = THIS_MODULE, ++ .early = 1, ++ .initialise = toi_alloc_initialise, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_alloc_init(void) ++{ ++ int result = toi_register_module(&toi_alloc_ops); ++ return result; ++} ++ ++void toi_alloc_exit(void) ++{ ++ toi_unregister_module(&toi_alloc_ops); ++} ++#endif +diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h +new file mode 100644 +index 0000000..099ee51 +--- /dev/null ++++ b/kernel/power/tuxonice_alloc.h +@@ -0,0 +1,54 @@ ++/* ++ * kernel/power/tuxonice_alloc.h ++ * ++ * Copyright (C) 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ */ ++ ++#include ++#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) ++#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) ++ ++#ifdef CONFIG_PM_DEBUG ++extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); ++extern void toi_kfree(int fail_num, const void *arg, int size); ++ ++extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, ++ unsigned int order); ++#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) ++extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); ++extern void toi_free_page(int fail_num, unsigned long buf); ++extern void toi__free_page(int fail_num, struct page *page); ++extern void toi_free_pages(int fail_num, struct page *page, int order); ++extern struct page *toi_alloc_page(int fail_num, gfp_t mask); ++extern int toi_alloc_init(void); ++extern void toi_alloc_exit(void); ++ ++extern void toi_alloc_print_debug_stats(void); ++ ++#else /* CONFIG_PM_DEBUG */ ++ ++#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) ++#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) ++ ++#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) ++#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) ++#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) ++#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) ++#define toi__free_page(FAIL, PAGE) __free_page(PAGE) ++#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) ++#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) ++static inline int toi_alloc_init(void) ++{ ++ return 0; ++} ++ ++static inline void toi_alloc_exit(void) { } ++ ++static inline void toi_alloc_print_debug_stats(void) { } ++ ++#endif ++ ++extern int toi_trace_allocs; +diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c +new file mode 100644 +index 0000000..c524acb +--- /dev/null ++++ b/kernel/power/tuxonice_atomic_copy.c +@@ -0,0 +1,473 @@ ++/* ++ * kernel/power/tuxonice_atomic_copy.c ++ * ++ * Copyright 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * Routines for doing the atomic save/restore. 
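++ *
++ * toi_copy_pageset1() makes the atomic copy of pageset1, toi_hibernate()
++ * and toi_atomic_restore() wrap the low-level architecture code, and
++ * toi_go_atomic()/toi_end_atomic() quiesce and reawaken the rest of the
++ * system around the copy.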
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_io.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_modules.h" ++ ++unsigned long extra_pd1_pages_used; ++ ++/** ++ * free_pbe_list - free page backup entries used by the atomic copy code. ++ * @list: List to free. ++ * @highmem: Whether the list is in highmem. ++ * ++ * Normally, this function isn't used. If, however, we need to abort before ++ * doing the atomic copy, we use this to free the pbes previously allocated. ++ **/ ++static void free_pbe_list(struct pbe **list, int highmem) ++{ ++ while (*list) { ++ int i; ++ struct pbe *free_pbe, *next_page = NULL; ++ struct page *page; ++ ++ if (highmem) { ++ page = (struct page *) *list; ++ free_pbe = (struct pbe *) kmap(page); ++ } else { ++ page = virt_to_page(*list); ++ free_pbe = *list; ++ } ++ ++ for (i = 0; i < PBES_PER_PAGE; i++) { ++ if (!free_pbe) ++ break; ++ if (highmem) ++ toi__free_page(29, free_pbe->address); ++ else ++ toi_free_page(29, ++ (unsigned long) free_pbe->address); ++ free_pbe = free_pbe->next; ++ } ++ ++ if (highmem) { ++ if (free_pbe) ++ next_page = free_pbe; ++ kunmap(page); ++ } else { ++ if (free_pbe) ++ next_page = free_pbe; ++ } ++ ++ toi__free_page(29, page); ++ *list = (struct pbe *) next_page; ++ }; ++} ++ ++/** ++ * copyback_post - post atomic-restore actions ++ * ++ * After doing the atomic restore, we have a few more things to do: ++ * 1) We want to retain some values across the restore, so we now copy ++ * these from the nosave variables to the normal ones. ++ * 2) Set the status flags. ++ * 3) Resume devices. ++ * 4) Tell userui so it can redraw & restore settings. ++ * 5) Reread the page cache. ++ **/ ++void copyback_post(void) ++{ ++ struct toi_boot_kernel_data *bkd = ++ (struct toi_boot_kernel_data *) boot_kernel_data_buffer; ++ ++ if (toi_activate_storage(1)) ++ panic("Failed to reactivate our storage."); ++ ++ toi_post_atomic_restore_modules(bkd); ++ ++ toi_cond_pause(1, "About to reload secondary pagedir."); ++ ++ if (read_pageset2(0)) ++ panic("Unable to successfully reread the page cache."); ++ ++ /* ++ * If the user wants to sleep again after resuming from full-off, ++ * it's most likely to be in order to suspend to ram, so we'll ++ * do this check after loading pageset2, to give them the fastest ++ * wakeup when they are ready to use the computer again. ++ */ ++ toi_check_resleep(); ++} ++ ++/** ++ * toi_copy_pageset1 - do the atomic copy of pageset1 ++ * ++ * Make the atomic copy of pageset1. We can't use copy_page (as we once did) ++ * because we can't be sure what side effects it has. On my old Duron, with ++ * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt ++ * count at resume time 4 instead of 3. ++ * ++ * We don't want to call kmap_atomic unconditionally because it has the side ++ * effect of incrementing the preempt count, which will leave it one too high ++ * post resume (the page containing the preempt count will be copied after ++ * its incremented. This is essentially the same problem. 
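++ *
++ * Instead, the copy below is a plain word-by-word loop: kmap_atomic() is
++ * only used where a page really is in highmem, and pages that are not
++ * marked present are temporarily mapped with kernel_map_pages() around
++ * the copy and unmapped again afterwards.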
++ **/ ++void toi_copy_pageset1(void) ++{ ++ int i; ++ unsigned long source_index, dest_index; ++ ++ memory_bm_position_reset(pageset1_map); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ source_index = memory_bm_next_pfn(pageset1_map); ++ dest_index = memory_bm_next_pfn(pageset1_copy_map); ++ ++ for (i = 0; i < pagedir1.size; i++) { ++ unsigned long *origvirt, *copyvirt; ++ struct page *origpage, *copypage; ++ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, ++ was_present1, was_present2; ++ ++ origpage = pfn_to_page(source_index); ++ copypage = pfn_to_page(dest_index); ++ ++ origvirt = PageHighMem(origpage) ? ++ kmap_atomic(origpage) : ++ page_address(origpage); ++ ++ copyvirt = PageHighMem(copypage) ? ++ kmap_atomic(copypage) : ++ page_address(copypage); ++ ++ was_present1 = kernel_page_present(origpage); ++ if (!was_present1) ++ kernel_map_pages(origpage, 1, 1); ++ ++ was_present2 = kernel_page_present(copypage); ++ if (!was_present2) ++ kernel_map_pages(copypage, 1, 1); ++ ++ while (loop >= 0) { ++ *(copyvirt + loop) = *(origvirt + loop); ++ loop--; ++ } ++ ++ if (!was_present1) ++ kernel_map_pages(origpage, 1, 0); ++ ++ if (!was_present2) ++ kernel_map_pages(copypage, 1, 0); ++ ++ if (PageHighMem(origpage)) ++ kunmap_atomic(origvirt); ++ ++ if (PageHighMem(copypage)) ++ kunmap_atomic(copyvirt); ++ ++ source_index = memory_bm_next_pfn(pageset1_map); ++ dest_index = memory_bm_next_pfn(pageset1_copy_map); ++ } ++} ++ ++/** ++ * __toi_post_context_save - steps after saving the cpu context ++ * ++ * Steps taken after saving the CPU state to make the actual ++ * atomic copy. ++ * ++ * Called from swsusp_save in snapshot.c via toi_post_context_save. ++ **/ ++int __toi_post_context_save(void) ++{ ++ unsigned long old_ps1_size = pagedir1.size; ++ ++ check_checksums(); ++ ++ free_checksum_pages(); ++ ++ toi_recalculate_image_contents(1); ++ ++ extra_pd1_pages_used = pagedir1.size > old_ps1_size ? ++ pagedir1.size - old_ps1_size : 0; ++ ++ if (extra_pd1_pages_used > extra_pd1_pages_allowance) { ++ printk(KERN_INFO "Pageset1 has grown by %lu pages. " ++ "extra_pages_allowance is currently only %lu.\n", ++ pagedir1.size - old_ps1_size, ++ extra_pd1_pages_allowance); ++ ++ /* ++ * Highlevel code will see this, clear the state and ++ * retry if we haven't already done so twice. ++ */ ++ if (any_to_free(1)) { ++ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ return 1; ++ } ++ if (try_allocate_extra_memory()) { ++ printk(KERN_INFO "Failed to allocate the extra memory" ++ " needed. Restarting the process."); ++ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ return 1; ++ } ++ printk(KERN_INFO "However it looks like there's enough" ++ " free ram and storage to handle this, so " ++ " continuing anyway."); ++ /* ++ * What if try_allocate_extra_memory above calls ++ * toi_allocate_extra_pagedir_memory and it allocs a new ++ * slab page via toi_kzalloc which should be in ps1? So... ++ */ ++ toi_recalculate_image_contents(1); ++ } ++ ++ if (!test_action_state(TOI_TEST_FILTER_SPEED) && ++ !test_action_state(TOI_TEST_BIO)) ++ toi_copy_pageset1(); ++ ++ return 0; ++} ++ ++/** ++ * toi_hibernate - high level code for doing the atomic copy ++ * ++ * High-level code which prepares to do the atomic copy. Loosely based ++ * on the swsusp version, but with the following twists: ++ * - We set toi_running so the swsusp code uses our code paths. ++ * - We give better feedback regarding what goes wrong if there is a ++ * problem. 
++ * - We use an extra function to call the assembly, just in case this code ++ * is in a module (return address). ++ **/ ++int toi_hibernate(void) ++{ ++ int error; ++ ++ toi_running = 1; /* For the swsusp code we use :< */ ++ ++ error = toi_lowlevel_builtin(); ++ ++ if (!error) { ++ struct toi_boot_kernel_data *bkd = ++ (struct toi_boot_kernel_data *) boot_kernel_data_buffer; ++ ++ /* ++ * The boot kernel's data may be larger (newer version) or ++ * smaller (older version) than ours. Copy the minimum ++ * of the two sizes, so that we don't overwrite valid values ++ * from pre-atomic copy. ++ */ ++ ++ memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, ++ min_t(int, sizeof(struct toi_boot_kernel_data), ++ bkd->size)); ++ } ++ ++ toi_running = 0; ++ return error; ++} ++ ++/** ++ * toi_atomic_restore - prepare to do the atomic restore ++ * ++ * Get ready to do the atomic restore. This part gets us into the same ++ * state we are in prior to do calling do_toi_lowlevel while ++ * hibernating: hot-unplugging secondary cpus and freeze processes, ++ * before starting the thread that will do the restore. ++ **/ ++int toi_atomic_restore(void) ++{ ++ int error; ++ ++ toi_running = 1; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); ++ ++ memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, ++ strlen(saved_command_line)); ++ ++ toi_pre_atomic_restore_modules(&toi_bkd); ++ ++ if (add_boot_kernel_data_pbe()) ++ goto Failed; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); ++ ++ if (toi_go_atomic(PMSG_QUIESCE, 0)) ++ goto Failed; ++ ++ /* We'll ignore saved state, but this gets preempt count (etc) right */ ++ save_processor_state(); ++ ++ error = swsusp_arch_resume(); ++ /* ++ * Code below is only ever reached in case of failure. Otherwise ++ * execution continues at place where swsusp_arch_suspend was called. ++ * ++ * We don't know whether it's safe to continue (this shouldn't happen), ++ * so lets err on the side of caution. ++ */ ++ BUG(); ++ ++Failed: ++ free_pbe_list(&restore_pblist, 0); ++#ifdef CONFIG_HIGHMEM ++ free_pbe_list(&restore_highmem_pblist, 1); ++#endif ++ toi_running = 0; ++ return 1; ++} ++ ++/** ++ * toi_go_atomic - do the actual atomic copy/restore ++ * @state: The state to use for dpm_suspend_start & power_down calls. ++ * @suspend_time: Whether we're suspending or resuming. ++ **/ ++int toi_go_atomic(pm_message_t state, int suspend_time) ++{ ++ if (suspend_time) { ++ if (platform_begin(1)) { ++ set_abort_result(TOI_PLATFORM_PREP_FAILED); ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); ++ return 1; ++ } ++ ++ if (dpm_prepare(PMSG_FREEZE)) { ++ set_abort_result(TOI_DPM_PREPARE_FAILED); ++ dpm_complete(PMSG_RECOVER); ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); ++ return 1; ++ } ++ } ++ ++ suspend_console(); ++ ftrace_stop(); ++ pm_restrict_gfp_mask(); ++ ++ if (suspend_time) { ++ if (dpm_suspend(state)) { ++ set_abort_result(TOI_DPM_SUSPEND_FAILED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); ++ return 1; ++ } ++ } else { ++ if (dpm_suspend_start(state)) { ++ set_abort_result(TOI_DPM_SUSPEND_FAILED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); ++ return 1; ++ } ++ } ++ ++ /* At this point, dpm_suspend_start() has been called, but *not* ++ * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. ++ * Otherwise, drivers for some devices (e.g. 
interrupt controllers) ++ * become desynchronized with the actual state of the hardware ++ * at resume time, and evil weirdness ensues. ++ */ ++ ++ if (dpm_suspend_end(state)) { ++ set_abort_result(TOI_DEVICE_REFUSED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); ++ return 1; ++ } ++ ++ if (suspend_time) { ++ if (platform_pre_snapshot(1)) ++ set_abort_result(TOI_PRE_SNAPSHOT_FAILED); ++ } else { ++ if (platform_pre_restore(1)) ++ set_abort_result(TOI_PRE_RESTORE_FAILED); ++ } ++ ++ if (test_result_state(TOI_ABORTED)) { ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); ++ return 1; ++ } ++ ++ if (test_action_state(TOI_LATE_CPU_HOTPLUG)) { ++ if (disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, ++ suspend_time, 1); ++ return 1; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ if (syscore_suspend()) { ++ set_abort_result(TOI_SYSCORE_REFUSED); ++ toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); ++ return 1; ++ } ++ ++ if (suspend_time && pm_wakeup_pending()) { ++ set_abort_result(TOI_WAKEUP_EVENT); ++ toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); ++ return 1; ++ } ++ return 0; ++} ++ ++/** ++ * toi_end_atomic - post atomic copy/restore routines ++ * @stage: What step to start at. ++ * @suspend_time: Whether we're suspending or resuming. ++ * @error: Whether we're recovering from an error. ++ **/ ++void toi_end_atomic(int stage, int suspend_time, int error) ++{ ++ pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : ++ PMSG_RESTORE; ++ ++ switch (stage) { ++ case ATOMIC_ALL_STEPS: ++ if (!suspend_time) { ++ events_check_enabled = false; ++ platform_leave(1); ++ } ++ case ATOMIC_STEP_SYSCORE_RESUME: ++ syscore_resume(); ++ case ATOMIC_STEP_IRQS: ++ local_irq_enable(); ++ case ATOMIC_STEP_CPU_HOTPLUG: ++ if (test_action_state(TOI_LATE_CPU_HOTPLUG)) ++ enable_nonboot_cpus(); ++ case ATOMIC_STEP_PLATFORM_FINISH: ++ if (!suspend_time && error & 2) ++ platform_restore_cleanup(1); ++ else ++ platform_finish(1); ++ dpm_resume_start(msg); ++ case ATOMIC_STEP_DEVICE_RESUME: ++ if (suspend_time && (error & 2)) ++ platform_recover(1); ++ dpm_resume(msg); ++ if (error || !toi_in_suspend()) ++ pm_restore_gfp_mask(); ++ ftrace_start(); ++ resume_console(); ++ case ATOMIC_STEP_DPM_COMPLETE: ++ dpm_complete(msg); ++ case ATOMIC_STEP_PLATFORM_END: ++ platform_end(1); ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); ++ } ++} +diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h +new file mode 100644 +index 0000000..6a989c1 +--- /dev/null ++++ b/kernel/power/tuxonice_atomic_copy.h +@@ -0,0 +1,23 @@ ++/* ++ * kernel/power/tuxonice_atomic_copy.h ++ * ++ * Copyright 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * Routines for doing the atomic save/restore. 
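++ *
++ * The enum below lists the tear-down stages used by toi_end_atomic(); its
++ * switch statement falls through from the stage passed in to the end, so
++ * passing a later stage skips the earlier clean-up steps.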
++ */ ++ ++enum { ++ ATOMIC_ALL_STEPS, ++ ATOMIC_STEP_SYSCORE_RESUME, ++ ATOMIC_STEP_IRQS, ++ ATOMIC_STEP_CPU_HOTPLUG, ++ ATOMIC_STEP_PLATFORM_FINISH, ++ ATOMIC_STEP_DEVICE_RESUME, ++ ATOMIC_STEP_DPM_COMPLETE, ++ ATOMIC_STEP_PLATFORM_END, ++}; ++ ++int toi_go_atomic(pm_message_t state, int toi_time); ++void toi_end_atomic(int stage, int toi_time, int error); +diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h +new file mode 100644 +index 0000000..9627ccc +--- /dev/null ++++ b/kernel/power/tuxonice_bio.h +@@ -0,0 +1,77 @@ ++/* ++ * kernel/power/tuxonice_bio.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains declarations for functions exported from ++ * tuxonice_bio.c, which contains low level io functions. ++ */ ++ ++#include ++#include "tuxonice_extent.h" ++ ++void toi_put_extent_chain(struct hibernate_extent_chain *chain); ++int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, ++ unsigned long start, unsigned long end); ++ ++struct hibernate_extent_saved_state { ++ int extent_num; ++ struct hibernate_extent *extent_ptr; ++ unsigned long offset; ++}; ++ ++struct toi_bdev_info { ++ struct toi_bdev_info *next; ++ struct hibernate_extent_chain blocks; ++ struct block_device *bdev; ++ struct toi_module_ops *allocator; ++ int allocator_index; ++ struct hibernate_extent_chain allocations; ++ char name[266]; /* "swap on " or "file " + up to 256 chars */ ++ ++ /* Saved in header */ ++ char uuid[17]; ++ dev_t dev_t; ++ int prio; ++ int bmap_shift; ++ int blocks_per_page; ++ unsigned long pages_used; ++ struct hibernate_extent_saved_state saved_state[4]; ++}; ++ ++struct toi_extent_iterate_state { ++ struct toi_bdev_info *current_chain; ++ int num_chains; ++ int saved_chain_number[4]; ++ struct toi_bdev_info *saved_chain_ptr[4]; ++}; ++ ++/* ++ * Our exported interface so the swapwriter and filewriter don't ++ * need these functions duplicated. ++ */ ++struct toi_bio_ops { ++ int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, ++ struct page *page); ++ int (*register_storage)(struct toi_bdev_info *new); ++ void (*free_storage)(void); ++}; ++ ++struct toi_allocator_ops { ++ unsigned long (*toi_swap_storage_available) (void); ++}; ++ ++extern struct toi_bio_ops toi_bio_ops; ++ ++extern char *toi_writer_buffer; ++extern int toi_writer_buffer_posn; ++ ++struct toi_bio_allocator_ops { ++ int (*register_storage) (void); ++ unsigned long (*storage_available)(void); ++ int (*allocate_storage) (struct toi_bdev_info *, unsigned long); ++ int (*bmap) (struct toi_bdev_info *); ++ void (*free_storage) (struct toi_bdev_info *); ++}; +diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c +new file mode 100644 +index 0000000..c214d18 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_chains.c +@@ -0,0 +1,1048 @@ ++/* ++ * kernel/power/tuxonice_bio_devinfo.c ++ * ++ * Copyright (C) 2009-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ */ ++ ++#include ++#include "tuxonice_bio.h" ++#include "tuxonice_bio_internal.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++#include "tuxonice.h" ++#include "tuxonice_io.h" ++ ++static struct toi_bdev_info *prio_chain_head; ++static int num_chains; ++ ++/* Pointer to current entry being loaded/saved. 
*/ ++struct toi_extent_iterate_state toi_writer_posn; ++ ++#define metadata_size (sizeof(struct toi_bdev_info) - \ ++ offsetof(struct toi_bdev_info, uuid)) ++ ++/* ++ * After section 0 (header) comes 2 => next_section[0] = 2 ++ */ ++static int next_section[3] = { 2, 3, 1 }; ++ ++/** ++ * dump_block_chains - print the contents of the bdev info array. ++ **/ ++void dump_block_chains(void) ++{ ++ int i = 0; ++ int j; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ ++ while (cur_chain) { ++ struct hibernate_extent *this = cur_chain->blocks.first; ++ ++ printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); ++ ++ while (this) { ++ printk(KERN_CONT " [%lu-%lu]%s", this->start, ++ this->end, this->next ? "," : ""); ++ this = this->next; ++ } ++ ++ printk("\n"); ++ cur_chain = cur_chain->next; ++ i++; ++ } ++ ++ printk(KERN_DEBUG "Saved states:\n"); ++ for (i = 0; i < 4; i++) { ++ printk(KERN_DEBUG "Slot %d: Chain %d.\n", ++ i, toi_writer_posn.saved_chain_number[i]); ++ ++ cur_chain = prio_chain_head; ++ j = 0; ++ while (cur_chain) { ++ printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", ++ j, cur_chain->saved_state[i].extent_num, ++ cur_chain->saved_state[i].offset); ++ cur_chain = cur_chain->next; ++ j++; ++ } ++ printk(KERN_CONT "\n"); ++ } ++} ++ ++/** ++ * ++ **/ ++static void toi_extent_chain_next(void) ++{ ++ struct toi_bdev_info *this = toi_writer_posn.current_chain; ++ ++ if (!this->blocks.current_extent) ++ return; ++ ++ if (this->blocks.current_offset == this->blocks.current_extent->end) { ++ if (this->blocks.current_extent->next) { ++ this->blocks.current_extent = ++ this->blocks.current_extent->next; ++ this->blocks.current_offset = ++ this->blocks.current_extent->start; ++ } else { ++ this->blocks.current_extent = NULL; ++ this->blocks.current_offset = 0; ++ } ++ } else ++ this->blocks.current_offset++; ++} ++ ++/** ++ * ++ */ ++ ++static struct toi_bdev_info *__find_next_chain_same_prio(void) ++{ ++ struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; ++ struct toi_bdev_info *this = start_chain; ++ int orig_prio = this->prio; ++ ++ do { ++ this = this->next; ++ ++ if (!this) ++ this = prio_chain_head; ++ ++ /* Back on original chain? Use it again. */ ++ if (this == start_chain) ++ return start_chain; ++ ++ } while (!this->blocks.current_extent || this->prio != orig_prio); ++ ++ return this; ++} ++ ++static void find_next_chain(void) ++{ ++ struct toi_bdev_info *this; ++ ++ this = __find_next_chain_same_prio(); ++ ++ /* ++ * If we didn't get another chain of the same priority that we ++ * can use, look for the next priority. ++ */ ++ while (this && !this->blocks.current_extent) ++ this = this->next; ++ ++ toi_writer_posn.current_chain = this; ++} ++ ++/** ++ * toi_extent_state_next - go to the next extent ++ * @blocks: The number of values to progress. ++ * @stripe_mode: Whether to spread usage across all chains. ++ * ++ * Given a state, progress to the next valid entry. We may begin in an ++ * invalid state, as we do when invoked after extent_state_goto_start below. ++ * ++ * When using compression and expected_compression > 0, we let the image size ++ * be larger than storage, so we can validly run out of data to return. 
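++ *
++ * The position is advanced @blocks entries along the current chain. The
++ * header stream is not striped: it only moves to another chain once the
++ * current one is exhausted, while the other streams try the next usable
++ * chain on every call, spreading I/O across the registered devices.
++ * Returns 0 while a position is available and -ENOSPC once no usable
++ * chain remains.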
++ **/ ++static unsigned long toi_extent_state_next(int blocks, int current_stream) ++{ ++ int i; ++ ++ if (!toi_writer_posn.current_chain) ++ return -ENOSPC; ++ ++ /* Assume chains always have lengths that are multiples of @blocks */ ++ for (i = 0; i < blocks; i++) ++ toi_extent_chain_next(); ++ ++ /* The header stream is not striped */ ++ if (current_stream || ++ !toi_writer_posn.current_chain->blocks.current_extent) ++ find_next_chain(); ++ ++ return toi_writer_posn.current_chain ? 0 : -ENOSPC; ++} ++ ++static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) ++{ ++ struct toi_bdev_info **prev_ptr; ++ struct toi_bdev_info *cur; ++ ++ /* Loop through the existing chain, finding where to insert it */ ++ prev_ptr = &prio_chain_head; ++ cur = prio_chain_head; ++ ++ while (cur && cur->prio >= this->prio) { ++ prev_ptr = &cur->next; ++ cur = cur->next; ++ } ++ ++ this->next = *prev_ptr; ++ *prev_ptr = this; ++ ++ this = prio_chain_head; ++ while (this) ++ this = this->next; ++ num_chains++; ++} ++ ++/** ++ * toi_extent_state_goto_start - reinitialize an extent chain iterator ++ * @state: Iterator to reinitialize ++ **/ ++void toi_extent_state_goto_start(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ ++ while (this) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Setting current extent to %p.", this->blocks.first); ++ this->blocks.current_extent = this->blocks.first; ++ if (this->blocks.current_extent) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Setting current offset to %lu.", ++ this->blocks.current_extent->start); ++ this->blocks.current_offset = ++ this->blocks.current_extent->start; ++ } ++ ++ this = this->next; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", ++ prio_chain_head); ++ toi_writer_posn.current_chain = prio_chain_head; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); ++} ++ ++/** ++ * toi_extent_state_save - save state of the iterator ++ * @state: Current state of the chain ++ * @saved_state: Iterator to populate ++ * ++ * Given a state and a struct hibernate_extent_state_store, save the current ++ * position in a format that can be used with relocated chains (at ++ * resume time). 
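++ *
++ * No pointers are stored: the position is saved as the number of the
++ * current chain together with, for each chain, the index of its current
++ * extent and the offset within it, so it can be re-applied after the
++ * chains have been rebuilt at resume time.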
++ **/ ++void toi_extent_state_save(int slot) ++{ ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ struct hibernate_extent *extent; ++ struct hibernate_extent_saved_state *chain_state; ++ int i = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", ++ slot); ++ ++ if (!toi_writer_posn.current_chain) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " ++ "chain_num = -1."); ++ toi_writer_posn.saved_chain_number[slot] = -1; ++ return; ++ } ++ ++ while (cur_chain) { ++ i++; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " ++ "state, slot %d.", i, cur_chain, slot); ++ ++ chain_state = &cur_chain->saved_state[slot]; ++ ++ chain_state->offset = cur_chain->blocks.current_offset; ++ ++ if (toi_writer_posn.current_chain == cur_chain) { ++ toi_writer_posn.saved_chain_number[slot] = i; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " ++ "we were on => chain_num is %d.", i); ++ } ++ ++ if (!cur_chain->blocks.current_extent) { ++ chain_state->extent_num = 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " ++ "for this chain => extent_num %d is 0.", ++ i); ++ cur_chain = cur_chain->next; ++ continue; ++ } ++ ++ extent = cur_chain->blocks.first; ++ chain_state->extent_num = 1; ++ ++ while (extent != cur_chain->blocks.current_extent) { ++ chain_state->extent_num++; ++ extent = extent->next; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, ++ chain_state->extent_num); ++ ++ cur_chain = cur_chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Completed saving extent state slot %d.", slot); ++} ++ ++/** ++ * toi_extent_state_restore - restore the position saved by extent_state_save ++ * @state: State to populate ++ * @saved_state: Iterator saved to restore ++ **/ ++void toi_extent_state_restore(int slot) ++{ ++ int i = 0; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ struct hibernate_extent_saved_state *chain_state; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "toi_extent_state_restore - slot %d.", slot); ++ ++ if (toi_writer_posn.saved_chain_number[slot] == -1) { ++ toi_writer_posn.current_chain = NULL; ++ return; ++ } ++ ++ while (cur_chain) { ++ int posn; ++ int j; ++ i++; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " ++ "state, slot %d.", i, cur_chain, slot); ++ ++ chain_state = &cur_chain->saved_state[slot]; ++ ++ posn = chain_state->extent_num; ++ ++ cur_chain->blocks.current_extent = cur_chain->blocks.first; ++ cur_chain->blocks.current_offset = chain_state->offset; ++ ++ if (i == toi_writer_posn.saved_chain_number[slot]) { ++ toi_writer_posn.current_chain = cur_chain; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Found current chain."); ++ } ++ ++ for (j = 0; j < 4; j++) ++ if (i == toi_writer_posn.saved_chain_number[j]) { ++ toi_writer_posn.saved_chain_ptr[j] = cur_chain; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Found saved chain ptr %d (%p) (offset" ++ " %d).", j, cur_chain, ++ cur_chain->saved_state[j].offset); ++ } ++ ++ if (posn) { ++ while (--posn) ++ cur_chain->blocks.current_extent = ++ cur_chain->blocks.current_extent->next; ++ } else ++ cur_chain->blocks.current_extent = NULL; ++ ++ cur_chain = cur_chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++} ++ ++/* ++ * Storage needed ++ * ++ * Returns amount of space in the image header required ++ * for the chain data. This ignores the links between ++ * pages, which we factor in when allocating the space. 
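++ *
++ * The space accounted for is: the number of chains, then for each chain
++ * its device metadata, its extent count and a start/end pair per extent,
++ * and finally the four saved chain numbers.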
++ */ ++int toi_bio_devinfo_storage_needed(void) ++{ ++ int result = sizeof(num_chains); ++ struct toi_bdev_info *chain = prio_chain_head; ++ ++ while (chain) { ++ result += metadata_size; ++ ++ /* Chain size */ ++ result += sizeof(int); ++ ++ /* Extents */ ++ result += (2 * sizeof(unsigned long) * ++ chain->blocks.num_extents); ++ ++ chain = chain->next; ++ } ++ ++ result += 4 * sizeof(int); ++ return result; ++} ++ ++static unsigned long chain_pages_used(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *this = chain->blocks.first; ++ struct hibernate_extent_saved_state *state = &chain->saved_state[3]; ++ unsigned long size = 0; ++ int extent_idx = 1; ++ ++ if (!state->extent_num) { ++ if (!this) ++ return 0; ++ else ++ return chain->blocks.size; ++ } ++ ++ while (extent_idx < state->extent_num) { ++ size += (this->end - this->start + 1); ++ this = this->next; ++ extent_idx++; ++ } ++ ++ /* We didn't use the one we're sitting on, so don't count it */ ++ return size + state->offset - this->start; ++} ++ ++/** ++ * toi_serialise_extent_chain - write a chain in the image ++ * @chain: Chain to write. ++ **/ ++static int toi_serialise_extent_chain(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *this; ++ int ret; ++ int i = 1; ++ ++ chain->pages_used = chain_pages_used(chain); ++ ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", ++ chain->dev_t); ++ /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, ++ (char *) &chain->uuid, metadata_size); ++ if (ret) ++ return ret; ++ ++ /* Num extents */ ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, ++ (char *) &chain->blocks.num_extents, sizeof(int)); ++ if (ret) ++ return ret; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", ++ chain->blocks.num_extents); ++ ++ this = chain->blocks.first; ++ while (this) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, ++ (char *) this, 2 * sizeof(this->start)); ++ if (ret) ++ return ret; ++ this = this->next; ++ i++; ++ } ++ ++ return ret; ++} ++ ++int toi_serialise_extent_chains(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ int result; ++ ++ /* Write the number of chains */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", ++ num_chains); ++ result = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, (char *) &num_chains, ++ sizeof(int)); ++ if (result) ++ return result; ++ ++ /* Then the chains themselves */ ++ while (this) { ++ result = toi_serialise_extent_chain(this); ++ if (result) ++ return result; ++ this = this->next; ++ } ++ ++ /* ++ * Finally, the chain we should be on at the start of each ++ * section. 
++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); ++ result = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, ++ (char *) &toi_writer_posn.saved_chain_number[0], ++ 4 * sizeof(int)); ++ ++ return result; ++} ++ ++int toi_register_storage_chain(struct toi_bdev_info *new) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", ++ new); ++ toi_insert_chain_in_prio_list(new); ++ return 0; ++} ++ ++static void free_bdev_info(struct toi_bdev_info *chain) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); ++ toi_put_extent_chain(&chain->blocks); ++ ++ /* ++ * The allocator may need to do more than just free the chains ++ * (swap_free, for example). Don't call from boot kernel. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); ++ if (chain->allocator) ++ chain->allocator->bio_allocator_ops->free_storage(chain); ++ ++ /* ++ * Dropping out of reading atomic copy? Need to undo ++ * toi_open_by_devnum. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); ++ if (chain->bdev && !IS_ERR(chain->bdev) && ++ chain->bdev != resume_block_device && ++ chain->bdev != header_block_device && ++ test_toi_state(TOI_TRYING_TO_RESUME)) ++ toi_close_bdev(chain->bdev); ++ ++ /* Poison */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); ++ toi_kfree(39, chain, sizeof(*chain)); ++ ++ if (prio_chain_head == chain) ++ prio_chain_head = NULL; ++ ++ num_chains--; ++} ++ ++void free_all_bdev_info(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ ++ while (this) { ++ struct toi_bdev_info *next = this->next; ++ free_bdev_info(this); ++ this = next; ++ } ++ ++ memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); ++ prio_chain_head = NULL; ++} ++ ++static void set_up_start_position(void) ++{ ++ toi_writer_posn.current_chain = prio_chain_head; ++ go_next_page(0, 0); ++} ++ ++/** ++ * toi_load_extent_chain - read back a chain saved in the image ++ * @chain: Chain to load ++ * ++ * The linked list of extents is reconstructed from the disk. chain will point ++ * to the first entry. 
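++ *
++ * The data is read back in the order toi_serialise_extent_chain() wrote
++ * it: the device metadata, the number of extents, then one start/end pair
++ * per extent. The chain's block device is opened (or the resume/header
++ * device reused) when the first extent is read, and the chain is inserted
++ * into the priority list at that point so it can be used to read further
++ * header pages.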
++ **/ ++int toi_load_extent_chain(int index, int *num_loaded) ++{ ++ struct toi_bdev_info *chain = toi_kzalloc(39, ++ sizeof(struct toi_bdev_info), GFP_ATOMIC); ++ struct hibernate_extent *this, *last = NULL; ++ int i, ret; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); ++ /* Get dev_t, prio, bmap_shift, blocks per page, positions */ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &chain->uuid, metadata_size); ++ ++ if (ret) { ++ printk(KERN_ERR "Failed to read the size of extent chain.\n"); ++ toi_kfree(39, chain, sizeof(*chain)); ++ return 1; ++ } ++ ++ toi_bkd.pages_used[index] = chain->pages_used; ++ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &chain->blocks.num_extents, sizeof(int)); ++ if (ret) { ++ printk(KERN_ERR "Failed to read the size of extent chain.\n"); ++ toi_kfree(39, chain, sizeof(*chain)); ++ return 1; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", ++ chain->blocks.num_extents); ++ ++ for (i = 0; i < chain->blocks.num_extents; i++) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); ++ ++ this = toi_kzalloc(2, sizeof(struct hibernate_extent), ++ TOI_ATOMIC_GFP); ++ if (!this) { ++ printk(KERN_INFO "Failed to allocate a new extent.\n"); ++ free_bdev_info(chain); ++ return -ENOMEM; ++ } ++ this->next = NULL; ++ /* Get the next page */ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, ++ NULL, (char *) this, 2 * sizeof(this->start)); ++ if (ret) { ++ printk(KERN_INFO "Failed to read an extent.\n"); ++ toi_kfree(2, this, sizeof(struct hibernate_extent)); ++ free_bdev_info(chain); ++ return 1; ++ } ++ ++ if (last) ++ last->next = this; ++ else { ++ char b1[32], b2[32], b3[32]; ++ /* ++ * Open the bdev ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Chain dev_t is %s. Resume dev t is %s. Header" ++ " bdev_t is %s.\n", ++ format_dev_t(b1, chain->dev_t), ++ format_dev_t(b2, resume_dev_t), ++ format_dev_t(b3, toi_sig_data->header_dev_t)); ++ ++ if (chain->dev_t == resume_dev_t) ++ chain->bdev = resume_block_device; ++ else if (chain->dev_t == toi_sig_data->header_dev_t) ++ chain->bdev = header_block_device; ++ else { ++ chain->bdev = toi_open_bdev(chain->uuid, ++ chain->dev_t, 1); ++ if (IS_ERR(chain->bdev)) { ++ free_bdev_info(chain); ++ return -ENODEV; ++ } ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " ++ "is %d and blocks per page is %d.", ++ chain->bmap_shift, ++ chain->blocks_per_page); ++ ++ chain->blocks.first = this; ++ ++ /* ++ * Couldn't do this earlier, but can't do ++ * goto_start now - we may have already used blocks ++ * in the first chain. ++ */ ++ chain->blocks.current_extent = this; ++ chain->blocks.current_offset = this->start; ++ ++ /* ++ * Can't wait until we've read the whole chain ++ * before we insert it in the list. We might need ++ * this chain to read the next page in the header ++ */ ++ toi_insert_chain_in_prio_list(chain); ++ } ++ ++ /* ++ * We have to wait until 2 extents are loaded before setting up ++ * properly because if the first extent has only one page, we ++ * will need to put the position on the second extent. Sounds ++ * obvious, but it wasn't! ++ */ ++ (*num_loaded)++; ++ if ((*num_loaded) == 2) ++ set_up_start_position(); ++ last = this; ++ } ++ ++ /* ++ * Shouldn't get empty chains, but it's not impossible. Link them in so ++ * they get freed properly later. 
++ */ ++ if (!chain->blocks.num_extents) ++ toi_insert_chain_in_prio_list(chain); ++ ++ if (!chain->blocks.current_extent) { ++ chain->blocks.current_extent = chain->blocks.first; ++ if (chain->blocks.current_extent) ++ chain->blocks.current_offset = ++ chain->blocks.current_extent->start; ++ } ++ return 0; ++} ++ ++int toi_load_extent_chains(void) ++{ ++ int result; ++ int to_load; ++ int i; ++ int extents_loaded = 0; ++ ++ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &to_load, ++ sizeof(int)); ++ if (result) ++ return result; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); ++ ++ for (i = 0; i < to_load; i++) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", ++ i, to_load); ++ result = toi_load_extent_chain(i, &extents_loaded); ++ if (result) ++ return result; ++ } ++ ++ /* If we never got to a second extent, we still need to do this. */ ++ if (extents_loaded == 1) ++ set_up_start_position(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); ++ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, ++ &toi_blockwriter_ops, ++ (char *) &toi_writer_posn.saved_chain_number[0], ++ 4 * sizeof(int)); ++ ++ return result; ++} ++ ++static int toi_end_of_stream(int writing, int section_barrier) ++{ ++ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; ++ int compare_to = next_section[current_stream]; ++ struct toi_bdev_info *compare_chain = ++ toi_writer_posn.saved_chain_ptr[compare_to]; ++ int compare_offset = compare_chain ? ++ compare_chain->saved_state[compare_to].offset : 0; ++ ++ if (!section_barrier) ++ return 0; ++ ++ if (!cur_chain) ++ return 1; ++ ++ if (cur_chain == compare_chain && ++ cur_chain->blocks.current_offset == compare_offset) { ++ if (writing) { ++ if (!current_stream) { ++ debug_broken_header(); ++ return 1; ++ } ++ } else { ++ more_readahead = 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Reached the end of stream %d " ++ "(not an error).", current_stream); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * go_next_page - skip blocks to the start of the next page ++ * @writing: Whether we're reading or writing the image. ++ * ++ * Go forward one page. ++ **/ ++int go_next_page(int writing, int section_barrier) ++{ ++ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; ++ int max = cur_chain ? cur_chain->blocks_per_page : 1; ++ ++ /* Nope. Go foward a page - or maybe two. Don't stripe the header, ++ * so that bad fragmentation doesn't put the extent data containing ++ * the location of the second page out of the first header page. ++ */ ++ if (toi_extent_state_next(max, current_stream)) { ++ /* Don't complain if readahead falls off the end */ ++ if (writing && section_barrier) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. " ++ "Expected compression ratio too optimistic?"); ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to " ++ "read/write. (Not necessarily a fatal error."); ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++int devices_of_same_priority(struct toi_bdev_info *this) ++{ ++ struct toi_bdev_info *check = prio_chain_head; ++ int i = 0; ++ ++ while (check) { ++ if (check->prio == this->prio) ++ i++; ++ check = check->next; ++ } ++ ++ return i; ++} ++ ++/** ++ * toi_bio_rw_page - do i/o on the next disk page in the image ++ * @writing: Whether reading or writing. ++ * @page: Page to do i/o on. 
++ * @is_readahead: Whether we're doing readahead ++ * @free_group: The group used in allocating the page ++ * ++ * Submit a page for reading or writing, possibly readahead. ++ * Pass the group used in allocating the page as well, as it should ++ * be freed on completion of the bio if we're writing the page. ++ **/ ++int toi_bio_rw_page(int writing, struct page *page, ++ int is_readahead, int free_group) ++{ ++ int result = toi_end_of_stream(writing, 1); ++ struct toi_bdev_info *dev_info = toi_writer_posn.current_chain; ++ ++ if (result) { ++ if (writing) ++ abort_hibernate(TOI_INSUFFICIENT_STORAGE, ++ "Insufficient storage for your image."); ++ else ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to " ++ "read/write another page when stream has " ++ "ended."); ++ return -ENOSPC; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "%s %lx:%ld", ++ writing ? "Write" : "Read", ++ dev_info->dev_t, dev_info->blocks.current_offset); ++ ++ result = toi_do_io(writing, dev_info->bdev, ++ dev_info->blocks.current_offset << dev_info->bmap_shift, ++ page, is_readahead, 0, free_group); ++ ++ /* Ignore the result here - will check end of stream if come in again */ ++ go_next_page(writing, 1); ++ ++ if (result) ++ printk(KERN_ERR "toi_do_io returned %d.\n", result); ++ return result; ++} ++ ++dev_t get_header_dev_t(void) ++{ ++ return prio_chain_head->dev_t; ++} ++ ++struct block_device *get_header_bdev(void) ++{ ++ return prio_chain_head->bdev; ++} ++ ++unsigned long get_headerblock(void) ++{ ++ return prio_chain_head->blocks.first->start << ++ prio_chain_head->bmap_shift; ++} ++ ++int get_main_pool_phys_params(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ int result; ++ ++ while (this) { ++ result = this->allocator->bio_allocator_ops->bmap(this); ++ if (result) ++ return result; ++ this = this->next; ++ } ++ ++ return 0; ++} ++ ++static int apply_header_reservation(void) ++{ ++ int i; ++ ++ if (!header_pages_reserved) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "No header pages reserved at the moment."); ++ return 0; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); ++ ++ /* Apply header space reservation */ ++ toi_extent_state_goto_start(); ++ ++ for (i = 0; i < header_pages_reserved; i++) ++ if (go_next_page(1, 0)) ++ return -ENOSPC; ++ ++ /* The end of header pages will be the start of pageset 2 */ ++ toi_extent_state_save(2); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Finished applying header reservation."); ++ return 0; ++} ++ ++static int toi_bio_register_storage(void) ++{ ++ int result = 0; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type != BIO_ALLOCATOR_MODULE) ++ continue; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Registering storage from %s.", ++ this_module->name); ++ result = this_module->bio_allocator_ops->register_storage(); ++ if (result) ++ break; ++ } ++ ++ return result; ++} ++ ++int toi_bio_allocate_storage(unsigned long request) ++{ ++ struct toi_bdev_info *chain = prio_chain_head; ++ unsigned long to_get = request; ++ unsigned long extra_pages, needed; ++ int no_free = 0; ++ ++ if (!chain) { ++ int result = toi_bio_register_storage(); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " ++ "Registering storage."); ++ if (result) ++ return 0; ++ chain = prio_chain_head; ++ if (!chain) { ++ printk("TuxOnIce: No storage was registered.\n"); ++ return 0; ++ } ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, 
"toi_bio_allocate_storage: " ++ "Request is %lu pages.", request); ++ extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) ++ + sizeof(int)), PAGE_SIZE); ++ needed = request + extra_pages + header_pages_reserved; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " ++ "for header => %lu.", ++ extra_pages, header_pages_reserved, needed); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", ++ raw_pages_allocd); ++ ++ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); ++ ++ if (!to_get) ++ return apply_header_reservation(); ++ ++ while (to_get && chain) { ++ int num_group = devices_of_same_priority(chain); ++ int divisor = num_group - no_free; ++ int i; ++ unsigned long portion = DIV_ROUND_UP(to_get, divisor); ++ unsigned long got = 0; ++ unsigned long got_this_round = 0; ++ struct toi_bdev_info *top = chain; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Start of loop. To get is %lu. Divisor is %d.", ++ to_get, divisor); ++ no_free = 0; ++ ++ /* ++ * We're aiming to spread the allocated storage as evenly ++ * as possible, but we also want to get all the storage we ++ * can off this priority. ++ */ ++ for (i = 0; i < num_group; i++) { ++ struct toi_bio_allocator_ops *ops = ++ chain->allocator->bio_allocator_ops; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Asking for %lu pages from chain %p.", ++ portion, chain); ++ got = ops->allocate_storage(chain, portion); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Got %lu pages from allocator %p.", ++ got, chain); ++ if (!got) ++ no_free++; ++ got_this_round += got; ++ chain = chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " ++ "total of %lu pages from %d allocators.", ++ got_this_round, divisor - no_free); ++ ++ raw_pages_allocd += got_this_round; ++ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : ++ 0; ++ ++ /* ++ * If we got anything from chains of this priority and we ++ * still have storage to allocate, go over this priority ++ * again. ++ */ ++ if (got_this_round && to_get) ++ chain = top; ++ else ++ no_free = 0; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " ++ "get_main_pool_phys_params"); ++ /* Now let swap allocator bmap the pages */ ++ get_main_pool_phys_params(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header."); ++ return apply_header_reservation(); ++} ++ ++void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) ++{ ++ int i = 0; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ ++ while (cur_chain) { ++ cur_chain->pages_used = bkd->pages_used[i]; ++ cur_chain = cur_chain->next; ++ i++; ++ } ++} ++ ++int toi_bio_chains_debug_info(char *buffer, int size) ++{ ++ /* Show what we actually used */ ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ int len = 0; ++ ++ while (cur_chain) { ++ len += scnprintf(buffer + len, size - len, " Used %lu pages " ++ "from %s.\n", cur_chain->pages_used, ++ cur_chain->name); ++ cur_chain = cur_chain->next; ++ } ++ ++ return len; ++} +diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c +new file mode 100644 +index 0000000..790b829 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_core.c +@@ -0,0 +1,1838 @@ ++/* ++ * kernel/power/tuxonice_bio.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains block io functions for TuxOnIce. 
These are ++ * used by the swapwriter and it is planned that they will also ++ * be used by the NFSwriter. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_bio_internal.h" ++ ++#define MEMORY_ONLY 1 ++#define THROTTLE_WAIT 2 ++ ++/* #define MEASURE_MUTEX_CONTENTION */ ++#ifndef MEASURE_MUTEX_CONTENTION ++#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) ++#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) ++#else ++unsigned long mutex_times[2][2][NR_CPUS]; ++#define my_mutex_lock(index, the_lock) do { \ ++ int have_mutex; \ ++ have_mutex = mutex_trylock(the_lock); \ ++ if (!have_mutex) { \ ++ mutex_lock(the_lock); \ ++ mutex_times[index][0][smp_processor_id()]++; \ ++ } else { \ ++ mutex_times[index][1][smp_processor_id()]++; \ ++ } ++ ++#define my_mutex_unlock(index, the_lock) \ ++ mutex_unlock(the_lock); \ ++} while (0) ++#endif ++ ++static int page_idx, reset_idx; ++ ++static int target_outstanding_io = 1024; ++static int max_outstanding_writes, max_outstanding_reads; ++ ++static struct page *bio_queue_head, *bio_queue_tail; ++static atomic_t toi_bio_queue_size; ++static DEFINE_SPINLOCK(bio_queue_lock); ++ ++static int free_mem_throttle, throughput_throttle; ++int more_readahead = 1; ++static struct page *readahead_list_head, *readahead_list_tail; ++ ++static struct page *waiting_on; ++ ++static atomic_t toi_io_in_progress, toi_io_done; ++static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); ++ ++int current_stream; ++/* Not static, so that the allocators can setup and complete ++ * writing the header */ ++char *toi_writer_buffer; ++int toi_writer_buffer_posn; ++ ++static DEFINE_MUTEX(toi_bio_mutex); ++static DEFINE_MUTEX(toi_bio_readahead_mutex); ++ ++static struct task_struct *toi_queue_flusher; ++static int toi_bio_queue_flush_pages(int dedicated_thread); ++ ++struct toi_module_ops toi_blockwriter_ops; ++ ++#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ ++ atomic_read(&toi_bio_queue_size)) ++ ++unsigned long raw_pages_allocd, header_pages_reserved; ++ ++/** ++ * set_free_mem_throttle - set the point where we pause to avoid oom. ++ * ++ * Initially, this value is zero, but when we first fail to allocate memory, ++ * we set it (plus a buffer) and thereafter throttle i/o once that limit is ++ * reached. ++ **/ ++static void set_free_mem_throttle(void) ++{ ++ int new_throttle = nr_unallocated_buffer_pages() + 256; ++ ++ if (new_throttle > free_mem_throttle) ++ free_mem_throttle = new_throttle; ++} ++ ++#define NUM_REASONS 7 ++static atomic_t reasons[NUM_REASONS]; ++static char *reason_name[NUM_REASONS] = { ++ "readahead not ready", ++ "bio allocation", ++ "synchronous I/O", ++ "toi_bio_get_new_page", ++ "memory low", ++ "readahead buffer allocation", ++ "throughput_throttle", ++}; ++ ++/* User Specified Parameters. */ ++unsigned long resume_firstblock; ++dev_t resume_dev_t; ++struct block_device *resume_block_device; ++static atomic_t resume_bdev_open_count; ++ ++struct block_device *header_block_device; ++ ++/** ++ * toi_open_bdev: Open a bdev at resume time. ++ * ++ * index: The swap index. 
May be MAX_SWAPFILES for the resume_dev_t ++ * (the user can have resume= pointing at a swap partition/file that isn't ++ * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the ++ * header. It will be from a swap partition that was enabled when we hibernated, ++ * but we don't know it's real index until we read that first page. ++ * dev_t: The device major/minor. ++ * display_errs: Whether to try to do this quietly. ++ * ++ * We stored a dev_t in the image header. Open the matching device without ++ * requiring /dev/ in most cases and record the details needed ++ * to close it later and avoid duplicating work. ++ */ ++struct block_device *toi_open_bdev(char *uuid, dev_t default_device, ++ int display_errs) ++{ ++ struct block_device *bdev; ++ dev_t device = default_device; ++ char buf[32]; ++ int retried = 0; ++ ++retry: ++ if (uuid) { ++ struct fs_info seek; ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = 0; ++ seek.last_mount_size = 0; ++ device = blk_lookup_fs_info(&seek); ++ if (!device) { ++ device = default_device; ++ printk(KERN_DEBUG "Unable to resolve uuid. Falling back" ++ " to dev_t.\n"); ++ } else ++ printk(KERN_DEBUG "Resolved uuid to device %s.\n", ++ format_dev_t(buf, device)); ++ } ++ ++ if (!device) { ++ printk(KERN_ERR "TuxOnIce attempting to open a " ++ "blank dev_t!\n"); ++ dump_stack(); ++ return NULL; ++ } ++ bdev = toi_open_by_devnum(device); ++ ++ if (IS_ERR(bdev) || !bdev) { ++ if (!retried) { ++ retried = 1; ++ wait_for_device_probe(); ++ goto retry; ++ } ++ if (display_errs) ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "Failed to get access to block device " ++ "\"%x\" (error %d).\n Maybe you need " ++ "to run mknod and/or lvmsetup in an " ++ "initrd/ramfs?", device, bdev); ++ return ERR_PTR(-EINVAL); ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "TuxOnIce got bdev %p for dev_t %x.", ++ bdev, device); ++ ++ return bdev; ++} ++ ++static void toi_bio_reserve_header_space(unsigned long request) ++{ ++ header_pages_reserved = request; ++} ++ ++/** ++ * do_bio_wait - wait for some TuxOnIce I/O to complete ++ * @reason: The array index of the reason we're waiting. ++ * ++ * Wait for a particular page of I/O if we're after a particular page. ++ * If we're not after a particular page, wait instead for all in flight ++ * I/O to be completed or for us to have enough free memory to be able ++ * to submit more I/O. ++ * ++ * If we wait, we also update our statistics regarding why we waited. ++ **/ ++static void do_bio_wait(int reason) ++{ ++ struct page *was_waiting_on = waiting_on; ++ ++ /* On SMP, waiting_on can be reset, so we make a copy */ ++ if (was_waiting_on) { ++ wait_on_page_locked(was_waiting_on); ++ atomic_inc(&reasons[reason]); ++ } else { ++ atomic_inc(&reasons[reason]); ++ ++ wait_event(num_in_progress_wait, ++ !atomic_read(&toi_io_in_progress) || ++ nr_unallocated_buffer_pages() > free_mem_throttle); ++ } ++} ++ ++/** ++ * throttle_if_needed - wait for I/O completion if throttle points are reached ++ * @flags: What to check and how to act. ++ * ++ * Check whether we need to wait for some I/O to complete. We always check ++ * whether we have enough memory available, but may also (depending upon ++ * @reason) check if the throughput throttle limit has been reached. ++ **/ ++static int throttle_if_needed(int flags) ++{ ++ int free_pages = nr_unallocated_buffer_pages(); ++ ++ /* Getting low on memory and I/O is in progress? 
*/ ++ while (unlikely(free_pages < free_mem_throttle) && ++ atomic_read(&toi_io_in_progress) && ++ !test_result_state(TOI_ABORTED)) { ++ if (!(flags & THROTTLE_WAIT)) ++ return -ENOMEM; ++ do_bio_wait(4); ++ free_pages = nr_unallocated_buffer_pages(); ++ } ++ ++ while (!(flags & MEMORY_ONLY) && throughput_throttle && ++ TOTAL_OUTSTANDING_IO >= throughput_throttle && ++ !test_result_state(TOI_ABORTED)) { ++ int result = toi_bio_queue_flush_pages(0); ++ if (result) ++ return result; ++ atomic_inc(&reasons[6]); ++ wait_event(num_in_progress_wait, ++ !atomic_read(&toi_io_in_progress) || ++ TOTAL_OUTSTANDING_IO < throughput_throttle); ++ } ++ ++ return 0; ++} ++ ++/** ++ * update_throughput_throttle - update the raw throughput throttle ++ * @jif_index: The number of times this function has been called. ++ * ++ * This function is called four times per second by the core, and used to limit ++ * the amount of I/O we submit at once, spreading out our waiting through the ++ * whole job and letting userui get an opportunity to do its work. ++ * ++ * We don't start limiting I/O until 1/4s has gone so that we get a ++ * decent sample for our initial limit, and keep updating it because ++ * throughput may vary (on rotating media, eg) with our block number. ++ * ++ * We throttle to 1/10s worth of I/O. ++ **/ ++static void update_throughput_throttle(int jif_index) ++{ ++ int done = atomic_read(&toi_io_done); ++ throughput_throttle = done * 2 / 5 / jif_index; ++} ++ ++/** ++ * toi_finish_all_io - wait for all outstanding i/o to complete ++ * ++ * Flush any queued but unsubmitted I/O and wait for it all to complete. ++ **/ ++static int toi_finish_all_io(void) ++{ ++ int result = toi_bio_queue_flush_pages(0); ++ toi_bio_queue_flusher_should_finish = 1; ++ wake_up(&toi_io_queue_flusher); ++ wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); ++ return result; ++} ++ ++/** ++ * toi_end_bio - bio completion function. ++ * @bio: bio that has completed. ++ * @err: Error value. Yes, like end_swap_bio_read, we ignore it. ++ * ++ * Function called by the block driver from interrupt context when I/O is ++ * completed. If we were writing the page, we want to free it and will have ++ * set bio->bi_private to the parameter we should use in telling the page ++ * allocation accounting code what the page was allocated for. If we're ++ * reading the page, it will be in the singly linked list made from ++ * page->private pointers. ++ **/ ++static void toi_end_bio(struct bio *bio, int err) ++{ ++ struct page *page = bio->bi_io_vec[0].bv_page; ++ ++ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); ++ ++ unlock_page(page); ++ bio_put(bio); ++ ++ if (waiting_on == page) ++ waiting_on = NULL; ++ ++ put_page(page); ++ ++ if (bio->bi_private) ++ toi__free_page((int) ((unsigned long) bio->bi_private) , page); ++ ++ bio_put(bio); ++ ++ atomic_dec(&toi_io_in_progress); ++ atomic_inc(&toi_io_done); ++ ++ wake_up(&num_in_progress_wait); ++} ++ ++/** ++ * submit - submit BIO request ++ * @writing: READ or WRITE. ++ * @dev: The block device we're using. ++ * @first_block: The first sector we're using. ++ * @page: The page being used for I/O. ++ * @free_group: If writing, the group that was used in allocating the page ++ * and which will be used in freeing the page from the completion ++ * routine. ++ * ++ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the ++ * textbook - allocate and initialize the bio. If we're writing, make sure ++ * the page is marked as dirty. Then submit it and carry on." 
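++ *
++ * For writes, throttle_if_needed() may first wait until enough memory is
++ * free, and the bio allocation itself is retried after do_bio_wait() if
++ * it fails. The count of in-flight I/O is updated so that the maximum
++ * numbers of outstanding reads and writes can be reported in the debug
++ * statistics.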
++ * ++ * If we're just testing the speed of our own code, we fake having done all ++ * the hard work and all toi_end_bio immediately. ++ **/ ++static int submit(int writing, struct block_device *dev, sector_t first_block, ++ struct page *page, int free_group) ++{ ++ struct bio *bio = NULL; ++ int cur_outstanding_io, result; ++ ++ /* ++ * Shouldn't throttle if reading - can deadlock in the single ++ * threaded case as pages are only freed when we use the ++ * readahead. ++ */ ++ if (writing) { ++ result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); ++ if (result) ++ return result; ++ } ++ ++ while (!bio) { ++ bio = bio_alloc(TOI_ATOMIC_GFP, 1); ++ if (!bio) { ++ set_free_mem_throttle(); ++ do_bio_wait(1); ++ } ++ } ++ ++ bio->bi_bdev = dev; ++ bio->bi_sector = first_block; ++ bio->bi_private = (void *) ((unsigned long) free_group); ++ bio->bi_end_io = toi_end_bio; ++ ++ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { ++ printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", ++ (unsigned long long) first_block); ++ bio_put(bio); ++ return -EFAULT; ++ } ++ ++ bio_get(bio); ++ ++ cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); ++ if (writing) { ++ if (cur_outstanding_io > max_outstanding_writes) ++ max_outstanding_writes = cur_outstanding_io; ++ } else { ++ if (cur_outstanding_io > max_outstanding_reads) ++ max_outstanding_reads = cur_outstanding_io; ++ } ++ ++ ++ /* Still read the header! */ ++ if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { ++ /* Fake having done the hard work */ ++ set_bit(BIO_UPTODATE, &bio->bi_flags); ++ toi_end_bio(bio, 0); ++ } else ++ submit_bio(writing | REQ_TOI | REQ_SYNC, bio); ++ ++ return 0; ++} ++ ++/** ++ * toi_do_io: Prepare to do some i/o on a page and submit or batch it. ++ * ++ * @writing: Whether reading or writing. ++ * @bdev: The block device which we're using. ++ * @block0: The first sector we're reading or writing. ++ * @page: The page on which I/O is being done. ++ * @readahead_index: If doing readahead, the index (reset this flag when done). ++ * @syncio: Whether the i/o is being done synchronously. ++ * ++ * Prepare and start a read or write operation. ++ * ++ * Note that we always work with our own page. If writing, we might be given a ++ * compression buffer that will immediately be used to start compressing the ++ * next page. For reading, we do readahead and therefore don't know the final ++ * address where the data needs to go. ++ **/ ++int toi_do_io(int writing, struct block_device *bdev, long block0, ++ struct page *page, int is_readahead, int syncio, int free_group) ++{ ++ page->private = 0; ++ ++ /* Do here so we don't race against toi_bio_get_next_page_read */ ++ lock_page(page); ++ ++ if (is_readahead) { ++ if (readahead_list_head) ++ readahead_list_tail->private = (unsigned long) page; ++ else ++ readahead_list_head = page; ++ ++ readahead_list_tail = page; ++ } ++ ++ /* Done before submitting to avoid races. */ ++ if (syncio) ++ waiting_on = page; ++ ++ /* Submit the page */ ++ get_page(page); ++ ++ if (submit(writing, bdev, block0, page, free_group)) ++ return -EFAULT; ++ ++ if (syncio) ++ do_bio_wait(2); ++ ++ return 0; ++} ++ ++/** ++ * toi_bdev_page_io - simpler interface to do directly i/o on a single page ++ * @writing: Whether reading or writing. ++ * @bdev: Block device on which we're operating. ++ * @pos: Sector at which page to read or write starts. ++ * @page: Page to be read/written. ++ * ++ * A simple interface to submit a page of I/O and wait for its completion. 
++ * The caller must free the page used. ++ **/ ++static int toi_bdev_page_io(int writing, struct block_device *bdev, ++ long pos, struct page *page) ++{ ++ return toi_do_io(writing, bdev, pos, page, 0, 1, 0); ++} ++ ++/** ++ * toi_bio_memory_needed - report the amount of memory needed for block i/o ++ * ++ * We want to have at least enough memory so as to have target_outstanding_io ++ * or more transactions on the fly at once. If we can do more, fine. ++ **/ ++static int toi_bio_memory_needed(void) ++{ ++ return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + ++ sizeof(struct bio)); ++} ++ ++/** ++ * toi_bio_print_debug_stats - put out debugging info in the buffer provided ++ * @buffer: A buffer of size @size into which text should be placed. ++ * @size: The size of @buffer. ++ * ++ * Fill a buffer with debugging info. This is used for both our debug_info sysfs ++ * entry and for recording the same info in dmesg. ++ **/ ++static int toi_bio_print_debug_stats(char *buffer, int size) ++{ ++ int len = 0; ++ ++ if (toiActiveAllocator != &toi_blockwriter_ops) { ++ len = scnprintf(buffer, size, ++ "- Block I/O inactive.\n"); ++ return len; ++ } ++ ++ len = scnprintf(buffer, size, "- Block I/O active.\n"); ++ ++ len += toi_bio_chains_debug_info(buffer + len, size - len); ++ ++ len += scnprintf(buffer + len, size - len, ++ "- Max outstanding reads %d. Max writes %d.\n", ++ max_outstanding_reads, max_outstanding_writes); ++ ++ len += scnprintf(buffer + len, size - len, ++ " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", ++ target_outstanding_io, ++ PAGE_SIZE, (unsigned int) sizeof(struct request), ++ (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); ++ ++#ifdef MEASURE_MUTEX_CONTENTION ++ { ++ int i; ++ ++ len += scnprintf(buffer + len, size - len, ++ " Mutex contention while reading:\n Contended Free\n"); ++ ++ for_each_online_cpu(i) ++ len += scnprintf(buffer + len, size - len, ++ " %9lu %9lu\n", ++ mutex_times[0][0][i], mutex_times[0][1][i]); ++ ++ len += scnprintf(buffer + len, size - len, ++ " Mutex contention while writing:\n Contended Free\n"); ++ ++ for_each_online_cpu(i) ++ len += scnprintf(buffer + len, size - len, ++ " %9lu %9lu\n", ++ mutex_times[1][0][i], mutex_times[1][1][i]); ++ ++ } ++#endif ++ ++ return len + scnprintf(buffer + len, size - len, ++ " Free mem throttle point reached %d.\n", free_mem_throttle); ++} ++ ++static int total_header_bytes; ++static int unowned; ++ ++void debug_broken_header(void) ++{ ++ printk(KERN_DEBUG "Image header too big for size allocated!\n"); ++ print_toi_header_storage_for_modules(); ++ printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); ++ printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); ++ printk(KERN_DEBUG "Total unowned : %d.\n", unowned); ++ printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, ++ DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); ++ printk(KERN_DEBUG "Space needed now : %ld.\n", ++ get_header_storage_needed()); ++ dump_block_chains(); ++ abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); ++} ++ ++/** ++ * toi_rw_init - prepare to read or write a stream in the image ++ * @writing: Whether reading or writing. ++ * @stream number: Section of the image being processed. ++ * ++ * Prepare to read or write a section ('stream') in the image. 
++ **/ ++static int toi_rw_init(int writing, int stream_number) ++{ ++ if (stream_number) ++ toi_extent_state_restore(stream_number); ++ else ++ toi_extent_state_goto_start(); ++ ++ if (writing) { ++ reset_idx = 0; ++ if (!current_stream) ++ page_idx = 0; ++ } else { ++ reset_idx = 1; ++ } ++ ++ atomic_set(&toi_io_done, 0); ++ if (!toi_writer_buffer) ++ toi_writer_buffer = (char *) toi_get_zeroed_page(11, ++ TOI_ATOMIC_GFP); ++ toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; ++ ++ current_stream = stream_number; ++ ++ more_readahead = 1; ++ ++ return toi_writer_buffer ? 0 : -ENOMEM; ++} ++ ++/** ++ * toi_bio_queue_write - queue a page for writing ++ * @full_buffer: Pointer to a page to be queued ++ * ++ * Add a page to the queue to be submitted. If we're the queue flusher, ++ * we'll do this once we've dropped toi_bio_mutex, so other threads can ++ * continue to submit I/O while we're on the slow path doing the actual ++ * submission. ++ **/ ++static void toi_bio_queue_write(char **full_buffer) ++{ ++ struct page *page = virt_to_page(*full_buffer); ++ unsigned long flags; ++ ++ *full_buffer = NULL; ++ page->private = 0; ++ ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ if (!bio_queue_head) ++ bio_queue_head = page; ++ else ++ bio_queue_tail->private = (unsigned long) page; ++ ++ bio_queue_tail = page; ++ atomic_inc(&toi_bio_queue_size); ++ ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ wake_up(&toi_io_queue_flusher); ++} ++ ++/** ++ * toi_rw_cleanup - Cleanup after i/o. ++ * @writing: Whether we were reading or writing. ++ * ++ * Flush all I/O and clean everything up after reading or writing a ++ * section of the image. ++ **/ ++static int toi_rw_cleanup(int writing) ++{ ++ int i, result = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); ++ if (writing) { ++ if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) ++ toi_bio_queue_write(&toi_writer_buffer); ++ ++ while (bio_queue_head && !result) ++ result = toi_bio_queue_flush_pages(0); ++ ++ if (result) ++ return result; ++ ++ if (current_stream == 2) ++ toi_extent_state_save(1); ++ else if (current_stream == 1) ++ toi_extent_state_save(3); ++ } ++ ++ result = toi_finish_all_io(); ++ ++ while (readahead_list_head) { ++ void *next = (void *) readahead_list_head->private; ++ toi__free_page(12, readahead_list_head); ++ readahead_list_head = next; ++ } ++ ++ readahead_list_tail = NULL; ++ ++ if (!current_stream) ++ return result; ++ ++ for (i = 0; i < NUM_REASONS; i++) { ++ if (!atomic_read(&reasons[i])) ++ continue; ++ printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", ++ reason_name[i], atomic_read(&reasons[i])); ++ atomic_set(&reasons[i], 0); ++ } ++ ++ current_stream = 0; ++ return result; ++} ++ ++/** ++ * toi_start_one_readahead - start one page of readahead ++ * @dedicated_thread: Is this a thread dedicated to doing readahead? ++ * ++ * Start one new page of readahead. If this is being called by a thread ++ * whose only just is to submit readahead, don't quit because we failed ++ * to allocate a page. ++ **/ ++static int toi_start_one_readahead(int dedicated_thread) ++{ ++ char *buffer = NULL; ++ int oom = 0, result; ++ ++ result = throttle_if_needed(dedicated_thread ? 
THROTTLE_WAIT : 0); ++ if (result) ++ return result; ++ ++ mutex_lock(&toi_bio_readahead_mutex); ++ ++ while (!buffer) { ++ buffer = (char *) toi_get_zeroed_page(12, ++ TOI_ATOMIC_GFP); ++ if (!buffer) { ++ if (oom && !dedicated_thread) { ++ mutex_unlock(&toi_bio_readahead_mutex); ++ return -ENOMEM; ++ } ++ ++ oom = 1; ++ set_free_mem_throttle(); ++ do_bio_wait(5); ++ } ++ } ++ ++ result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); ++ if (result == -ENOSPC) ++ toi__free_page(12, virt_to_page(buffer)); ++ mutex_unlock(&toi_bio_readahead_mutex); ++ if (result) { ++ if (result == -ENOSPC) ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Last readahead page submitted."); ++ else ++ printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", ++ result); ++ } ++ return result; ++} ++ ++/** ++ * toi_start_new_readahead - start new readahead ++ * @dedicated_thread: Are we dedicated to this task? ++ * ++ * Start readahead of image pages. ++ * ++ * We can be called as a thread dedicated to this task (may be helpful on ++ * systems with lots of CPUs), in which case we don't exit until there's no ++ * more readahead. ++ * ++ * If this is not called by a dedicated thread, we top up our queue until ++ * there's no more readahead to submit, we've submitted the number given ++ * in target_outstanding_io or the number in progress exceeds the target ++ * outstanding I/O value. ++ * ++ * No mutex needed because this is only ever called by the first cpu. ++ **/ ++static int toi_start_new_readahead(int dedicated_thread) ++{ ++ int last_result, num_submitted = 0; ++ ++ /* Start a new readahead? */ ++ if (!more_readahead) ++ return 0; ++ ++ do { ++ last_result = toi_start_one_readahead(dedicated_thread); ++ ++ if (last_result) { ++ if (last_result == -ENOMEM || last_result == -ENOSPC) ++ return 0; ++ ++ printk(KERN_DEBUG ++ "Begin read chunk returned %d.\n", ++ last_result); ++ } else ++ num_submitted++; ++ ++ } while (more_readahead && !last_result && ++ (dedicated_thread || ++ (num_submitted < target_outstanding_io && ++ atomic_read(&toi_io_in_progress) < target_outstanding_io))); ++ ++ return last_result; ++} ++ ++/** ++ * bio_io_flusher - start the dedicated I/O flushing routine ++ * @writing: Whether we're writing the image. ++ **/ ++static int bio_io_flusher(int writing) ++{ ++ ++ if (writing) ++ return toi_bio_queue_flush_pages(1); ++ else ++ return toi_start_new_readahead(1); ++} ++ ++/** ++ * toi_bio_get_next_page_read - read a disk page, perhaps with readahead ++ * @no_readahead: Whether we can use readahead ++ * ++ * Read a page from disk, submitting readahead and cleaning up finished i/o ++ * while we wait for the page we're after. ++ **/ ++static int toi_bio_get_next_page_read(int no_readahead) ++{ ++ char *virt; ++ struct page *old_readahead_list_head; ++ ++ /* ++ * When reading the second page of the header, we have to ++ * delay submitting the read until after we've gotten the ++ * extents out of the first page. ++ */ ++ if (unlikely(no_readahead && toi_start_one_readahead(0))) { ++ printk(KERN_EMERG "No readahead and toi_start_one_readahead " ++ "returned non-zero.\n"); ++ return -EIO; ++ } ++ ++ if (unlikely(!readahead_list_head)) { ++ /* ++ * If the last page finishes exactly on the page ++ * boundary, we will be called one extra time and ++ * have no data to return. In this case, we should ++ * not BUG(), like we used to! 
++ */ ++ if (!more_readahead) { ++ printk(KERN_EMERG "No more readahead.\n"); ++ return -ENOSPC; ++ } ++ if (unlikely(toi_start_one_readahead(0))) { ++ printk(KERN_EMERG "No readahead and " ++ "toi_start_one_readahead returned non-zero.\n"); ++ return -EIO; ++ } ++ } ++ ++ if (PageLocked(readahead_list_head)) { ++ waiting_on = readahead_list_head; ++ do_bio_wait(0); ++ } ++ ++ virt = page_address(readahead_list_head); ++ memcpy(toi_writer_buffer, virt, PAGE_SIZE); ++ ++ mutex_lock(&toi_bio_readahead_mutex); ++ old_readahead_list_head = readahead_list_head; ++ readahead_list_head = (struct page *) readahead_list_head->private; ++ mutex_unlock(&toi_bio_readahead_mutex); ++ toi__free_page(12, old_readahead_list_head); ++ return 0; ++} ++ ++/** ++ * toi_bio_queue_flush_pages - flush the queue of pages queued for writing ++ * @dedicated_thread: Whether we're a dedicated thread ++ * ++ * Flush the queue of pages ready to be written to disk. ++ * ++ * If we're a dedicated thread, stay in here until told to leave, ++ * sleeping in wait_event. ++ * ++ * The first thread is normally the only one to come in here. Another ++ * thread can enter this routine too, though, via throttle_if_needed. ++ * Since that's the case, we must be careful to only have one thread ++ * doing this work at a time. Otherwise we have a race and could save ++ * pages out of order. ++ * ++ * If an error occurs, free all remaining pages without submitting them ++ * for I/O. ++ **/ ++ ++int toi_bio_queue_flush_pages(int dedicated_thread) ++{ ++ unsigned long flags; ++ int result = 0; ++ static DEFINE_MUTEX(busy); ++ ++ if (!mutex_trylock(&busy)) ++ return 0; ++ ++top: ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ while (bio_queue_head) { ++ struct page *page = bio_queue_head; ++ bio_queue_head = (struct page *) page->private; ++ if (bio_queue_tail == page) ++ bio_queue_tail = NULL; ++ atomic_dec(&toi_bio_queue_size); ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ ++ /* Don't generate more error messages if already had one */ ++ if (!result) ++ result = toi_bio_rw_page(WRITE, page, 0, 11); ++ /* ++ * If writing the page failed, don't drop out. ++ * Flush the rest of the queue too. ++ */ ++ if (result) ++ toi__free_page(11 , page); ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ } ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ ++ if (dedicated_thread) { ++ wait_event(toi_io_queue_flusher, bio_queue_head || ++ toi_bio_queue_flusher_should_finish); ++ if (likely(!toi_bio_queue_flusher_should_finish)) ++ goto top; ++ toi_bio_queue_flusher_should_finish = 0; ++ } ++ ++ mutex_unlock(&busy); ++ return result; ++} ++ ++/** ++ * toi_bio_get_new_page - get a new page for I/O ++ * @full_buffer: Pointer to a page to allocate. ++ **/ ++static int toi_bio_get_new_page(char **full_buffer) ++{ ++ int result = throttle_if_needed(THROTTLE_WAIT); ++ if (result) ++ return result; ++ ++ while (!*full_buffer) { ++ *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); ++ if (!*full_buffer) { ++ set_free_mem_throttle(); ++ do_bio_wait(3); ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O ++ * @writing: Bool - whether writing (or reading). ++ * @buffer: The start of the buffer to write or fill. ++ * @buffer_size: The size of the buffer to write or fill. ++ * @no_readahead: Don't try to start readhead (when getting extents). 
++ **/ ++static int toi_rw_buffer(int writing, char *buffer, int buffer_size, ++ int no_readahead) ++{ ++ int bytes_left = buffer_size, result = 0; ++ ++ while (bytes_left) { ++ char *source_start = buffer + buffer_size - bytes_left; ++ char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; ++ int capacity = PAGE_SIZE - toi_writer_buffer_posn; ++ char *to = writing ? dest_start : source_start; ++ char *from = writing ? source_start : dest_start; ++ ++ if (bytes_left <= capacity) { ++ memcpy(to, from, bytes_left); ++ toi_writer_buffer_posn += bytes_left; ++ return 0; ++ } ++ ++ /* Complete this page and start a new one */ ++ memcpy(to, from, capacity); ++ bytes_left -= capacity; ++ ++ if (!writing) { ++ /* ++ * Perform actual I/O: ++ * read readahead_list_head into toi_writer_buffer ++ */ ++ int result = toi_bio_get_next_page_read(no_readahead); ++ if (result) { ++ printk("toi_bio_get_next_page_read " ++ "returned %d.\n", result); ++ return result; ++ } ++ } else { ++ toi_bio_queue_write(&toi_writer_buffer); ++ result = toi_bio_get_new_page(&toi_writer_buffer); ++ if (result) { ++ printk(KERN_ERR "toi_bio_get_new_page returned " ++ "%d.\n", result); ++ return result; ++ } ++ } ++ ++ toi_writer_buffer_posn = 0; ++ toi_cond_pause(0, NULL); ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_bio_read_page - read a page of the image ++ * @pfn: The pfn where the data belongs. ++ * @buffer_page: The page containing the (possibly compressed) data. ++ * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). ++ * ++ * Read a (possibly compressed) page from the image, into buffer_page, ++ * returning its pfn and the buffer size. ++ **/ ++static int toi_bio_read_page(unsigned long *pfn, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int result = 0; ++ int this_idx; ++ char *buffer_virt = TOI_MAP(buf_type, buffer_page); ++ ++ /* ++ * Only call start_new_readahead if we don't have a dedicated thread ++ * and we're the queue flusher. ++ */ ++ if (current == toi_queue_flusher && more_readahead && ++ !test_action_state(TOI_NO_READAHEAD)) { ++ int result2 = toi_start_new_readahead(0); ++ if (result2) { ++ printk(KERN_DEBUG "Queue flusher and " ++ "toi_start_one_readahead returned non-zero.\n"); ++ result = -EIO; ++ goto out; ++ } ++ } ++ ++ my_mutex_lock(0, &toi_bio_mutex); ++ ++ /* ++ * Structure in the image: ++ * [destination pfn|page size|page data] ++ * buf_size is PAGE_SIZE ++ * We can validly find there's nothing to read in a multithreaded ++ * situation. ++ */ ++ if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || ++ toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || ++ toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || ++ toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { ++ result = -ENODATA; ++ goto out_unlock; ++ } ++ ++ if (reset_idx) { ++ page_idx = this_idx; ++ reset_idx = 0; ++ } else { ++ page_idx++; ++ if (!this_idx) ++ result = -ENODATA; ++ else if (page_idx != this_idx) ++ printk(KERN_ERR "Got page index %d, expected %d.\n", ++ this_idx, page_idx); ++ } ++ ++out_unlock: ++ my_mutex_unlock(0, &toi_bio_mutex); ++out: ++ TOI_UNMAP(buf_type, buffer_page); ++ return result; ++} ++ ++/** ++ * toi_bio_write_page - write a page of the image ++ * @pfn: The pfn where the data belongs. ++ * @buffer_page: The page containing the (possibly compressed) data. ++ * @buf_size: The number of bytes on @buffer_page used. ++ * ++ * Write a (possibly compressed) page to the image from the buffer, together ++ * with it's index and buffer size. 
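The on-image record layout documented here ([index|pfn|size|data], packed into PAGE_SIZE buffers by toi_rw_buffer()) can be illustrated with a small standalone sketch. The code below is plain userspace C under invented names (pack_bytes, flush_page) and simplifies a few details; in the TuxOnIce code the flush step queues the full page for block I/O (toi_bio_queue_write()) and obtains a fresh buffer rather than just counting pages.

#include <stdio.h>
#include <string.h>

#define EXAMPLE_PAGE_SIZE 4096

static unsigned char page_buf[EXAMPLE_PAGE_SIZE];
static int buf_posn, pages_flushed;

static void flush_page(void)
{
	pages_flushed++;		/* stand-in for queueing the page for I/O */
	buf_posn = 0;
}

/* Copy a record of arbitrary size into consecutive page-sized buffers. */
static void pack_bytes(const void *src, int len)
{
	const unsigned char *from = src;

	while (len) {
		int space = EXAMPLE_PAGE_SIZE - buf_posn;
		int chunk = len < space ? len : space;

		memcpy(page_buf + buf_posn, from, chunk);
		buf_posn += chunk;
		from += chunk;
		len -= chunk;
		if (buf_posn == EXAMPLE_PAGE_SIZE)
			flush_page();
	}
}

int main(void)
{
	/* One image record: [index | pfn | size | data], as described above. */
	int index = 1;
	unsigned long pfn = 0x1234;
	int size = EXAMPLE_PAGE_SIZE;
	unsigned char data[EXAMPLE_PAGE_SIZE] = { 0 };

	pack_bytes(&index, sizeof(index));
	pack_bytes(&pfn, sizeof(pfn));
	pack_bytes(&size, sizeof(size));
	pack_bytes(data, size);

	/* One full page gets flushed; the header bytes that spilled over stay pending. */
	printf("flushed %d full page(s), %d bytes pending\n",
	       pages_flushed, buf_posn);
	return 0;
}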
++ **/ ++static int toi_bio_write_page(unsigned long pfn, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ char *buffer_virt; ++ int result = 0, result2 = 0; ++ ++ if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) ++ return 0; ++ ++ my_mutex_lock(1, &toi_bio_mutex); ++ ++ if (test_result_state(TOI_ABORTED)) { ++ my_mutex_unlock(1, &toi_bio_mutex); ++ return 0; ++ } ++ ++ buffer_virt = TOI_MAP(buf_type, buffer_page); ++ page_idx++; ++ ++ /* ++ * Structure in the image: ++ * [destination pfn|page size|page data] ++ * buf_size is PAGE_SIZE ++ */ ++ if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || ++ toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || ++ toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || ++ toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { ++ printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " ++ "toi_bio_write_page.\n"); ++ result = -EIO; ++ } ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ my_mutex_unlock(1, &toi_bio_mutex); ++ ++ if (current == toi_queue_flusher) ++ result2 = toi_bio_queue_flush_pages(0); ++ ++ return result ? result : result2; ++} ++ ++/** ++ * _toi_rw_header_chunk - read or write a portion of the image header ++ * @writing: Whether reading or writing. ++ * @owner: The module for which we're writing. ++ * Used for confirming that modules ++ * don't use more header space than they asked for. ++ * @buffer: Address of the data to write. ++ * @buffer_size: Size of the data buffer. ++ * @no_readahead: Don't try to start readhead (when getting extents). ++ * ++ * Perform PAGE_SIZE I/O. Start readahead if needed. ++ **/ ++static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, ++ char *buffer, int buffer_size, int no_readahead) ++{ ++ int result = 0; ++ ++ if (owner) { ++ owner->header_used += buffer_size; ++ toi_message(TOI_HEADER, TOI_LOW, 1, ++ "Header: %s : %d bytes (%d/%d) from offset %d.", ++ owner->name, ++ buffer_size, owner->header_used, ++ owner->header_requested, ++ toi_writer_buffer_posn); ++ if (owner->header_used > owner->header_requested && writing) { ++ printk(KERN_EMERG "TuxOnIce module %s is using more " ++ "header space (%u) than it requested (%u).\n", ++ owner->name, ++ owner->header_used, ++ owner->header_requested); ++ return buffer_size; ++ } ++ } else { ++ unowned += buffer_size; ++ toi_message(TOI_HEADER, TOI_LOW, 1, ++ "Header: (No owner): %d bytes (%d total so far) from " ++ "offset %d.", buffer_size, unowned, ++ toi_writer_buffer_posn); ++ } ++ ++ if (!writing && !no_readahead && more_readahead) { ++ result = toi_start_new_readahead(0); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " ++ "returned %d.", result); ++ } ++ ++ if (!result) { ++ result = toi_rw_buffer(writing, buffer, buffer_size, ++ no_readahead); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " ++ "%d.", result); ++ } ++ ++ total_header_bytes += buffer_size; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " ++ "%d.", result); ++ return result; ++} ++ ++static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, ++ char *buffer, int size) ++{ ++ return _toi_rw_header_chunk(writing, owner, buffer, size, 1); ++} ++ ++static int toi_rw_header_chunk_noreadahead(int writing, ++ struct toi_module_ops *owner, char *buffer, int size) ++{ ++ return _toi_rw_header_chunk(writing, owner, buffer, size, 1); ++} ++ ++/** ++ * toi_bio_storage_needed - get the amount of storage needed for my fns ++ **/ ++static int toi_bio_storage_needed(void) ++{ 
++ return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); ++} ++ ++/** ++ * toi_bio_save_config_info - save block I/O config to image header ++ * @buf: PAGE_SIZE'd buffer into which data should be saved. ++ **/ ++static int toi_bio_save_config_info(char *buf) ++{ ++ int *ints = (int *) buf; ++ ints[0] = target_outstanding_io; ++ return sizeof(int); ++} ++ ++/** ++ * toi_bio_load_config_info - restore block I/O config ++ * @buf: Data to be reloaded. ++ * @size: Size of the buffer saved. ++ **/ ++static void toi_bio_load_config_info(char *buf, int size) ++{ ++ int *ints = (int *) buf; ++ target_outstanding_io = ints[0]; ++} ++ ++void close_resume_dev_t(int force) ++{ ++ if (!resume_block_device) ++ return; ++ ++ if (force) ++ atomic_set(&resume_bdev_open_count, 0); ++ else ++ atomic_dec(&resume_bdev_open_count); ++ ++ if (!atomic_read(&resume_bdev_open_count)) { ++ toi_close_bdev(resume_block_device); ++ resume_block_device = NULL; ++ } ++} ++ ++int open_resume_dev_t(int force, int quiet) ++{ ++ if (force) { ++ close_resume_dev_t(1); ++ atomic_set(&resume_bdev_open_count, 1); ++ } else ++ atomic_inc(&resume_bdev_open_count); ++ ++ if (resume_block_device) ++ return 0; ++ ++ resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); ++ if (IS_ERR(resume_block_device)) { ++ if (!quiet) ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "Failed to open device %x, where" ++ " the header should be found.", ++ resume_dev_t); ++ resume_block_device = NULL; ++ atomic_set(&resume_bdev_open_count, 0); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_bio_initialise - initialise bio code at start of some action ++ * @starting_cycle: Whether starting a hibernation cycle, or just reading or ++ * writing a sysfs value. ++ **/ ++static int toi_bio_initialise(int starting_cycle) ++{ ++ int result; ++ ++ if (!starting_cycle || !resume_dev_t) ++ return 0; ++ ++ max_outstanding_writes = 0; ++ max_outstanding_reads = 0; ++ current_stream = 0; ++ toi_queue_flusher = current; ++#ifdef MEASURE_MUTEX_CONTENTION ++ { ++ int i, j, k; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 2; j++) ++ for_each_online_cpu(k) ++ mutex_times[i][j][k] = 0; ++ } ++#endif ++ result = open_resume_dev_t(0, 1); ++ ++ if (result) ++ return result; ++ ++ return get_signature_page(); ++} ++ ++static unsigned long raw_to_real(unsigned long raw) ++{ ++ unsigned long extra; ++ ++ extra = (raw * (sizeof(unsigned long) + sizeof(int)) + ++ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / ++ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); ++ ++ return raw > extra ? raw - extra : 0; ++} ++ ++static unsigned long toi_bio_storage_available(void) ++{ ++ unsigned long sum = 0; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type != BIO_ALLOCATOR_MODULE) ++ continue; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " ++ "available from %s.", this_module->name); ++ sum += this_module->bio_allocator_ops->storage_available(); ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " ++ "pages (%d header pages).", sum, header_pages_reserved); ++ ++ return sum > header_pages_reserved ? ++ raw_to_real(sum - header_pages_reserved) : 0; ++ ++} ++ ++static unsigned long toi_bio_storage_allocated(void) ++{ ++ return raw_pages_allocd > header_pages_reserved ? 
++ raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; ++} ++ ++/* ++ * If we have read part of the image, we might have filled memory with ++ * data that should be zeroed out. ++ */ ++static void toi_bio_noresume_reset(void) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); ++ toi_rw_cleanup(READ); ++ free_all_bdev_info(); ++} ++ ++/** ++ * toi_bio_cleanup - cleanup after some action ++ * @finishing_cycle: Whether completing a cycle. ++ **/ ++static void toi_bio_cleanup(int finishing_cycle) ++{ ++ if (!finishing_cycle) ++ return; ++ ++ if (toi_writer_buffer) { ++ toi_free_page(11, (unsigned long) toi_writer_buffer); ++ toi_writer_buffer = NULL; ++ } ++ ++ forget_signature_page(); ++ ++ if (header_block_device && toi_sig_data && ++ toi_sig_data->header_dev_t != resume_dev_t) ++ toi_close_bdev(header_block_device); ++ ++ header_block_device = NULL; ++ ++ close_resume_dev_t(0); ++} ++ ++static int toi_bio_write_header_init(void) ++{ ++ int result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); ++ toi_rw_init(WRITE, 0); ++ toi_writer_buffer_posn = 0; ++ ++ /* Info needed to bootstrap goes at the start of the header. ++ * First we save the positions and devinfo, including the number ++ * of header pages. Then we save the structs containing data needed ++ * for reading the header pages back. ++ * Note that even if header pages take more than one page, when we ++ * read back the info, we will have restored the location of the ++ * next header page by the time we go to use it. ++ */ ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); ++ result = toi_serialise_extent_chains(); ++ ++ if (result) ++ return result; ++ ++ /* ++ * Signature page hasn't been modified at this point. Write it in ++ * the header so we can restore it later. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); ++ return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, ++ (char *) toi_cur_sig_page, ++ PAGE_SIZE); ++} ++ ++static int toi_bio_write_header_cleanup(void) ++{ ++ int result = 0; ++ ++ if (toi_writer_buffer_posn) ++ toi_bio_queue_write(&toi_writer_buffer); ++ ++ result = toi_finish_all_io(); ++ ++ unowned = 0; ++ total_header_bytes = 0; ++ ++ /* Set signature to save we have an image */ ++ if (!result) ++ result = toi_bio_mark_have_image(); ++ ++ return result; ++} ++ ++/* ++ * toi_bio_read_header_init() ++ * ++ * Description: ++ * 1. Attempt to read the device specified with resume=. ++ * 2. Check the contents of the swap header for our signature. ++ * 3. Warn, ignore, reset and/or continue as appropriate. ++ * 4. If continuing, read the toi_swap configuration section ++ * of the header and set up block device info so we can read ++ * the rest of the header & image. ++ * ++ * Returns: ++ * May not return if user choose to reboot at a warning. ++ * -EINVAL if cannot resume at this time. Booting should continue ++ * normally. ++ */ ++ ++static int toi_bio_read_header_init(void) ++{ ++ int result = 0; ++ char buf[32]; ++ ++ toi_writer_buffer_posn = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); ++ ++ if (!toi_sig_data) { ++ printk(KERN_INFO "toi_bio_read_header_init called when we " ++ "haven't verified there is an image!\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * If the header is not on the resume_swap_dev_t, get the resume device ++ * first. 
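The raw_to_real() helper a little earlier in this file converts raw pages of storage into usable image pages by subtracting the index overhead of sizeof(unsigned long) + sizeof(int) bytes per saved page. A worked example as standalone C, assuming a 64-bit build (12-byte overhead) and 4096-byte pages; the numbers are invented:

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL

static unsigned long ex_raw_to_real(unsigned long raw)
{
	unsigned long per_page = sizeof(unsigned long) + sizeof(int);
	unsigned long extra;

	/* Same rounding expression as raw_to_real() in the patch above. */
	extra = (raw * per_page + (EX_PAGE_SIZE + per_page + 1)) /
		(EX_PAGE_SIZE + per_page);

	return raw > extra ? raw - extra : 0;
}

int main(void)
{
	/* 10000 raw pages -> ~30 pages of index data -> 9970 usable pages. */
	printf("real = %lu\n", ex_raw_to_real(10000));
	return 0;
}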
++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", ++ toi_sig_data->header_dev_t); ++ if (toi_sig_data->have_uuid) { ++ struct fs_info seek; ++ dev_t device; ++ ++ strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); ++ seek.dev_t = toi_sig_data->header_dev_t; ++ seek.last_mount_size = 0; ++ device = blk_lookup_fs_info(&seek); ++ if (device) { ++ printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", ++ format_dev_t(buf, device)); ++ toi_sig_data->header_dev_t = device; ++ } ++ } ++ if (toi_sig_data->header_dev_t != resume_dev_t) { ++ header_block_device = toi_open_bdev(NULL, ++ toi_sig_data->header_dev_t, 1); ++ ++ if (IS_ERR(header_block_device)) ++ return PTR_ERR(header_block_device); ++ } else ++ header_block_device = resume_block_device; ++ ++ if (!toi_writer_buffer) ++ toi_writer_buffer = (char *) toi_get_zeroed_page(11, ++ TOI_ATOMIC_GFP); ++ more_readahead = 1; ++ ++ /* ++ * Read toi_swap configuration. ++ * Headerblock size taken into account already. ++ */ ++ result = toi_bio_ops.bdev_page_io(READ, header_block_device, ++ toi_sig_data->first_header_block, ++ virt_to_page((unsigned long) toi_writer_buffer)); ++ if (result) ++ return result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); ++ result = toi_load_extent_chains(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); ++ toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); ++ if (!toi_orig_sig_page) { ++ printk(KERN_ERR "Failed to allocate memory for the current" ++ " image signature.\n"); ++ return -ENOMEM; ++ } ++ ++ return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, ++ (char *) toi_orig_sig_page, ++ PAGE_SIZE); ++} ++ ++static int toi_bio_read_header_cleanup(void) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); ++ return toi_rw_cleanup(READ); ++} ++ ++/* Works only for digits and letters, but small and fast */ ++#define TOLOWER(x) ((x) | 0x20) ++ ++/* ++ * UUID must be 32 chars long. It may have dashes, but nothing ++ * else. ++ */ ++char *uuid_from_commandline(char *commandline) ++{ ++ int low = 0; ++ char *result = NULL, *output, *ptr; ++ ++ if (strncmp(commandline, "UUID=", 5)) ++ return NULL; ++ ++ result = kzalloc(17, GFP_KERNEL); ++ if (!result) { ++ printk("Failed to kzalloc UUID text memory.\n"); ++ return NULL; ++ } ++ ++ ptr = commandline + 5; ++ output = result; ++ ++ while (*ptr && (output - result) < 16) { ++ if (isxdigit(*ptr)) { ++ int value = isdigit(*ptr) ? *ptr - '0' : ++ TOLOWER(*ptr) - 'a' + 10; ++ if (low) { ++ *output += value; ++ output++; ++ } else { ++ *output = value << 4; ++ } ++ low = !low; ++ } else if (*ptr != '-') ++ break; ++ ptr++; ++ } ++ ++ if ((output - result) < 16 || *ptr) { ++ printk(KERN_DEBUG "Found resume=UUID=, but the value looks " ++ "invalid.\n"); ++ kfree(result); ++ result = NULL; ++ } ++ ++ return result; ++} ++ ++#define retry_if_fails(command) \ ++do { \ ++ command; \ ++ if (!resume_dev_t && !waited_for_device_probe) { \ ++ wait_for_device_probe(); \ ++ command; \ ++ waited_for_device_probe = 1; \ ++ } \ ++} while(0) ++ ++/** ++ * try_to_open_resume_device: Try to parse and open resume= ++ * ++ * Any "swap:" has been stripped away and we just have the path to deal with. ++ * We attempt to do name_to_dev_t, open and stat the file. Having opened the ++ * file, get the struct block_device * to match. 
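uuid_from_commandline() above packs the 32 hex digits of a UUID= argument (dashes permitted) into 16 raw bytes, high nibble first. The following is a userspace sketch of the same nibble packing with hypothetical names; the kernel's allocation, logging and trailing-character checks are omitted.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static int parse_uuid(const char *arg, unsigned char out[16])
{
	int nibbles = 0;

	if (strncmp(arg, "UUID=", 5))
		return -1;

	for (arg += 5; *arg && nibbles < 32; arg++) {
		int value;

		if (*arg == '-')
			continue;			/* dashes are skipped */
		if (!isxdigit((unsigned char) *arg))
			return -1;

		value = isdigit((unsigned char) *arg) ?
			*arg - '0' : tolower((unsigned char) *arg) - 'a' + 10;

		if (nibbles & 1)
			out[nibbles / 2] |= value;	/* low nibble */
		else
			out[nibbles / 2] = value << 4;	/* high nibble */
		nibbles++;
	}

	return nibbles == 32 ? 0 : -1;
}

int main(void)
{
	unsigned char uuid[16];
	int i;

	if (!parse_uuid("UUID=550e8400-e29b-41d4-a716-446655440000", uuid)) {
		for (i = 0; i < 16; i++)
			printf("%02x", uuid[i]);
		printf("\n");
	}
	return 0;
}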
++ */ ++static int try_to_open_resume_device(char *commandline, int quiet) ++{ ++ struct kstat stat; ++ int error = 0; ++ char *uuid = uuid_from_commandline(commandline); ++ int waited_for_device_probe = 0; ++ ++ resume_dev_t = MKDEV(0, 0); ++ ++ if (!strlen(commandline)) ++ retry_if_fails(toi_bio_scan_for_image(quiet)); ++ ++ if (uuid) { ++ struct fs_info seek; ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = resume_dev_t; ++ seek.last_mount_size = 0; ++ retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); ++ kfree(uuid); ++ } ++ ++ if (!resume_dev_t) ++ retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); ++ ++ if (!resume_dev_t) { ++ struct file *file = filp_open(commandline, ++ O_RDONLY|O_LARGEFILE, 0); ++ ++ if (!IS_ERR(file) && file) { ++ vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); ++ filp_close(file, NULL); ++ } else ++ error = vfs_stat(commandline, &stat); ++ if (!error) ++ resume_dev_t = stat.rdev; ++ } ++ ++ if (!resume_dev_t) { ++ if (quiet) ++ return 1; ++ ++ if (test_toi_state(TOI_TRYING_TO_RESUME)) ++ toi_early_boot_message(1, toi_translate_err_default, ++ "Failed to translate \"%s\" into a device id.\n", ++ commandline); ++ else ++ printk("TuxOnIce: Can't translate \"%s\" into a device " ++ "id yet.\n", commandline); ++ return 1; ++ } ++ ++ return open_resume_dev_t(1, quiet); ++} ++ ++/* ++ * Parse Image Location ++ * ++ * Attempt to parse a resume= parameter. ++ * Swap Writer accepts: ++ * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] ++ * ++ * Where: ++ * DEVNAME is convertable to a dev_t by name_to_dev_t ++ * FIRSTBLOCK is the location of the first block in the swap file ++ * (specifying for a swap partition is nonsensical but not prohibited). ++ * Data is validated by attempting to read a swap header from the ++ * location given. Failure will result in toi_swap refusing to ++ * save an image, and a reboot with correct parameters will be ++ * necessary. ++ */ ++static int toi_bio_parse_sig_location(char *commandline, ++ int only_allocator, int quiet) ++{ ++ char *thischar, *devstart, *colon = NULL; ++ int signature_found, result = -EINVAL, temp_result = 0; ++ ++ if (strncmp(commandline, "swap:", 5) && ++ strncmp(commandline, "file:", 5)) { ++ /* ++ * Failing swap:, we'll take a simple resume=/dev/hda2, or a ++ * blank value (scan) but fall through to other allocators ++ * if /dev/ or UUID= isn't matched. ++ */ ++ if (strncmp(commandline, "/dev/", 5) && ++ strncmp(commandline, "UUID=", 5) && ++ strlen(commandline)) ++ return 1; ++ } else ++ commandline += 5; ++ ++ devstart = commandline; ++ thischar = commandline; ++ while ((*thischar != ':') && (*thischar != '@') && ++ ((thischar - commandline) < 250) && (*thischar)) ++ thischar++; ++ ++ if (*thischar == ':') { ++ colon = thischar; ++ *colon = 0; ++ thischar++; ++ } ++ ++ while ((thischar - commandline) < 250 && *thischar) ++ thischar++; ++ ++ if (colon) { ++ unsigned long block; ++ temp_result = strict_strtoul(colon + 1, 0, &block); ++ if (!temp_result) ++ resume_firstblock = (int) block; ++ } else ++ resume_firstblock = 0; ++ ++ clear_toi_state(TOI_CAN_HIBERNATE); ++ clear_toi_state(TOI_CAN_RESUME); ++ ++ if (!temp_result) ++ temp_result = try_to_open_resume_device(devstart, quiet); ++ ++ if (colon) ++ *colon = ':'; ++ ++ /* No error if we only scanned */ ++ if (temp_result) ++ return strlen(commandline) ? 
-EINVAL : 1; ++ ++ signature_found = toi_bio_image_exists(quiet); ++ ++ if (signature_found != -1) { ++ result = 0; ++ /* ++ * TODO: If only file storage, CAN_HIBERNATE should only be ++ * set if file allocator's target is valid. ++ */ ++ set_toi_state(TOI_CAN_HIBERNATE); ++ set_toi_state(TOI_CAN_RESUME); ++ } else ++ if (!quiet) ++ printk(KERN_ERR "TuxOnIce: Block I/O: No " ++ "signature found at %s.\n", devstart); ++ ++ return result; ++} ++ ++static void toi_bio_release_storage(void) ++{ ++ header_pages_reserved = 0; ++ raw_pages_allocd = 0; ++ ++ free_all_bdev_info(); ++} ++ ++/* toi_swap_remove_image ++ * ++ */ ++static int toi_bio_remove_image(void) ++{ ++ int result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); ++ ++ result = toi_bio_restore_original_signature(); ++ ++ /* ++ * We don't do a sanity check here: we want to restore the swap ++ * whatever version of kernel made the hibernate image. ++ * ++ * We need to write swap, but swap may not be enabled so ++ * we write the device directly ++ * ++ * If we don't have an current_signature_page, we didn't ++ * read an image header, so don't change anything. ++ */ ++ ++ toi_bio_release_storage(); ++ ++ return result; ++} ++ ++struct toi_bio_ops toi_bio_ops = { ++ .bdev_page_io = toi_bdev_page_io, ++ .register_storage = toi_register_storage_chain, ++ .free_storage = toi_bio_release_storage, ++}; ++EXPORT_SYMBOL_GPL(toi_bio_ops); ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, ++ 0, 16384, 0, NULL), ++}; ++ ++struct toi_module_ops toi_blockwriter_ops = { ++ .type = WRITER_MODULE, ++ .name = "block i/o", ++ .directory = "block_io", ++ .module = THIS_MODULE, ++ .memory_needed = toi_bio_memory_needed, ++ .print_debug_info = toi_bio_print_debug_stats, ++ .storage_needed = toi_bio_storage_needed, ++ .save_config_info = toi_bio_save_config_info, ++ .load_config_info = toi_bio_load_config_info, ++ .initialise = toi_bio_initialise, ++ .cleanup = toi_bio_cleanup, ++ .post_atomic_restore = toi_bio_chains_post_atomic, ++ ++ .rw_init = toi_rw_init, ++ .rw_cleanup = toi_rw_cleanup, ++ .read_page = toi_bio_read_page, ++ .write_page = toi_bio_write_page, ++ .rw_header_chunk = toi_rw_header_chunk, ++ .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, ++ .io_flusher = bio_io_flusher, ++ .update_throughput_throttle = update_throughput_throttle, ++ .finish_all_io = toi_finish_all_io, ++ ++ .noresume_reset = toi_bio_noresume_reset, ++ .storage_available = toi_bio_storage_available, ++ .storage_allocated = toi_bio_storage_allocated, ++ .reserve_header_space = toi_bio_reserve_header_space, ++ .allocate_storage = toi_bio_allocate_storage, ++ .image_exists = toi_bio_image_exists, ++ .mark_resume_attempted = toi_bio_mark_resume_attempted, ++ .write_header_init = toi_bio_write_header_init, ++ .write_header_cleanup = toi_bio_write_header_cleanup, ++ .read_header_init = toi_bio_read_header_init, ++ .read_header_cleanup = toi_bio_read_header_cleanup, ++ .get_header_version = toi_bio_get_header_version, ++ .remove_image = toi_bio_remove_image, ++ .parse_sig_location = toi_bio_parse_sig_location, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/** ++ * toi_block_io_load - load time routine for block I/O module ++ * ++ * Register block i/o ops and sysfs entries. 
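The resume= syntax documented above, [swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE], amounts to stripping an optional allocator prefix and splitting off an optional first-block suffix before the device lookup. A minimal userspace illustration of that string handling; the @BLOCKSIZE part, the 250-character scan limit and the actual device resolution are left out.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char cmdline[] = "swap:/dev/sda2:0x1000";
	char *dev = cmdline;
	char *colon;
	unsigned long first_block = 0;

	/* Strip a recognised allocator prefix, if present. */
	if (!strncmp(dev, "swap:", 5) || !strncmp(dev, "file:", 5))
		dev += 5;

	/* An optional :FIRSTBLOCK suffix follows the device name. */
	colon = strchr(dev, ':');
	if (colon) {
		*colon = '\0';
		first_block = strtoul(colon + 1, NULL, 0);
	}

	printf("device \"%s\", first block %lu\n", dev, first_block);
	return 0;
}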
++ **/ ++static __init int toi_block_io_load(void) ++{ ++ return toi_register_module(&toi_blockwriter_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_block_io_unload(void) ++{ ++ toi_unregister_module(&toi_blockwriter_ops); ++} ++ ++module_init(toi_block_io_load); ++module_exit(toi_block_io_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce block io functions"); ++#else ++late_initcall(toi_block_io_load); ++#endif +diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h +new file mode 100644 +index 0000000..58c2481 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_internal.h +@@ -0,0 +1,86 @@ ++/* ++ * kernel/power/tuxonice_bio_internal.h ++ * ++ * Copyright (C) 2009-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains declarations for functions exported from ++ * tuxonice_bio.c, which contains low level io functions. ++ */ ++ ++/* Extent chains */ ++void toi_extent_state_goto_start(void); ++void toi_extent_state_save(int slot); ++int go_next_page(int writing, int section_barrier); ++void toi_extent_state_restore(int slot); ++void free_all_bdev_info(void); ++int devices_of_same_priority(struct toi_bdev_info *this); ++int toi_register_storage_chain(struct toi_bdev_info *new); ++int toi_serialise_extent_chains(void); ++int toi_load_extent_chains(void); ++int toi_bio_rw_page(int writing, struct page *page, int is_readahead, ++ int free_group); ++int toi_bio_restore_original_signature(void); ++int toi_bio_devinfo_storage_needed(void); ++unsigned long get_headerblock(void); ++dev_t get_header_dev_t(void); ++struct block_device *get_header_bdev(void); ++int toi_bio_allocate_storage(unsigned long request); ++ ++/* Signature functions */ ++#define HaveImage "HaveImage" ++#define NoImage "TuxOnIce" ++#define sig_size (sizeof(HaveImage)) ++ ++struct sig_data { ++ char sig[sig_size]; ++ int have_image; ++ int resumed_before; ++ ++ char have_uuid; ++ char header_uuid[17]; ++ dev_t header_dev_t; ++ unsigned long first_header_block; ++ ++ /* Repeat the signature to be sure we have a header version */ ++ char sig2[sig_size]; ++ int header_version; ++}; ++ ++void forget_signature_page(void); ++int toi_check_for_signature(void); ++int toi_bio_image_exists(int quiet); ++int get_signature_page(void); ++int toi_bio_mark_resume_attempted(int); ++extern char *toi_cur_sig_page; ++extern char *toi_orig_sig_page; ++int toi_bio_mark_have_image(void); ++extern struct sig_data *toi_sig_data; ++extern dev_t resume_dev_t; ++extern struct block_device *resume_block_device; ++extern struct block_device *header_block_device; ++extern unsigned long resume_firstblock; ++ ++struct block_device *open_bdev(dev_t device, int display_errs); ++extern int current_stream; ++extern int more_readahead; ++int toi_do_io(int writing, struct block_device *bdev, long block0, ++ struct page *page, int is_readahead, int syncio, int free_group); ++int get_main_pool_phys_params(void); ++ ++void toi_close_bdev(struct block_device *bdev); ++struct block_device *toi_open_bdev(char *uuid, dev_t default_device, ++ int display_errs); ++ ++extern struct toi_module_ops toi_blockwriter_ops; ++void dump_block_chains(void); ++void debug_broken_header(void); ++extern unsigned long raw_pages_allocd, header_pages_reserved; ++int toi_bio_chains_debug_info(char *buffer, int size); ++void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); ++int toi_bio_scan_for_image(int quiet); ++int 
toi_bio_get_header_version(void); ++ ++void close_resume_dev_t(int force); ++int open_resume_dev_t(int force, int quiet); +diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c +new file mode 100644 +index 0000000..2ebee7e +--- /dev/null ++++ b/kernel/power/tuxonice_bio_signature.c +@@ -0,0 +1,404 @@ ++/* ++ * kernel/power/tuxonice_bio_signature.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ */ ++ ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_bio_internal.h" ++ ++struct sig_data *toi_sig_data; ++ ++/* Struct of swap header pages */ ++ ++struct old_sig_data { ++ dev_t device; ++ unsigned long sector; ++ int resume_attempted; ++ int orig_sig_type; ++}; ++ ++union diskpage { ++ union swap_header swh; /* swh.magic is the only member used */ ++ struct sig_data sig_data; ++ struct old_sig_data old_sig_data; ++}; ++ ++union p_diskpage { ++ union diskpage *pointer; ++ char *ptr; ++ unsigned long address; ++}; ++ ++char *toi_cur_sig_page; ++char *toi_orig_sig_page; ++int have_image; ++int have_old_image; ++ ++int get_signature_page(void) ++{ ++ if (!toi_cur_sig_page) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Allocating current signature page."); ++ toi_cur_sig_page = (char *) toi_get_zeroed_page(38, ++ TOI_ATOMIC_GFP); ++ if (!toi_cur_sig_page) { ++ printk(KERN_ERR "Failed to allocate memory for the " ++ "current image signature.\n"); ++ return -ENOMEM; ++ } ++ ++ toi_sig_data = (struct sig_data *) toi_cur_sig_page; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," ++ " sector %d.", ++ resume_block_device->bd_dev, resume_firstblock); ++ ++ return toi_bio_ops.bdev_page_io(READ, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++void forget_signature_page(void) ++{ ++ if (toi_cur_sig_page) { ++ toi_sig_data = NULL; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" ++ " (%p).", toi_cur_sig_page); ++ toi_free_page(38, (unsigned long) toi_cur_sig_page); ++ toi_cur_sig_page = NULL; ++ } ++ ++ if (toi_orig_sig_page) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" ++ " (%p).", toi_orig_sig_page); ++ toi_free_page(38, (unsigned long) toi_orig_sig_page); ++ toi_orig_sig_page = NULL; ++ } ++} ++ ++/* ++ * We need to ensure we use the signature page that's currently on disk, ++ * so as to not remove the image header. Post-atomic-restore, the orig sig ++ * page will be empty, so we can use that as our method of knowing that we ++ * need to load the on-disk signature and not use the non-image sig in ++ * memory. (We're going to powerdown after writing the change, so it's safe. 
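struct sig_data above is what get_signature_page() reads from the first resume block: a signature string, flags recording whether an image exists and whether it has been resumed before, and the location of the image header. The sketch below shows the shape of that check with a trimmed-down struct and a placeholder signature constant; the real value, tuxonice_signature, is defined elsewhere in the TuxOnIce sources.

#include <stdio.h>
#include <string.h>

#define EX_SIG      "EXAMPLE-SIG"	/* placeholder, not the real signature */
#define EX_SIG_SIZE (sizeof(EX_SIG))

struct ex_sig_data {
	char sig[EX_SIG_SIZE];
	int have_image;
	int resumed_before;
	unsigned long first_header_block;
};

static int ex_image_exists(const struct ex_sig_data *sd)
{
	/* No recognisable signature at all -> indeterminate. */
	if (memcmp(sd->sig, EX_SIG, EX_SIG_SIZE))
		return -1;

	/* Signature present: have_image says whether an image follows. */
	return sd->have_image;
}

int main(void)
{
	struct ex_sig_data sd = { EX_SIG, 1, 0, 42 };

	printf("image exists: %d (header starts at block %lu)\n",
	       ex_image_exists(&sd), sd.first_header_block);
	return 0;
}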
++ */ ++int toi_bio_mark_resume_attempted(int flag) ++{ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", ++ flag); ++ if (!toi_orig_sig_page) { ++ forget_signature_page(); ++ get_signature_page(); ++ } ++ toi_sig_data->resumed_before = flag; ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++int toi_bio_mark_have_image(void) ++{ ++ int result = 0; ++ char buf[32]; ++ struct fs_info *fs_info; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); ++ memcpy(toi_sig_data->sig, tuxonice_signature, ++ sizeof(tuxonice_signature)); ++ toi_sig_data->have_image = 1; ++ toi_sig_data->resumed_before = 0; ++ toi_sig_data->header_dev_t = get_header_dev_t(); ++ toi_sig_data->have_uuid = 0; ++ ++ fs_info = fs_info_from_block_dev(get_header_bdev()); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ if (!result) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", ++ format_dev_t(buf, get_header_dev_t())); ++ toi_sig_data->have_uuid = 1; ++ } else ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " ++ "dev_t %s.", ++ format_dev_t(buf, get_header_dev_t())); ++ ++ toi_sig_data->first_header_block = get_headerblock(); ++ have_image = 1; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " ++ "is %d.", toi_sig_data->header_dev_t, ++ toi_sig_data->first_header_block); ++ ++ memcpy(toi_sig_data->sig2, tuxonice_signature, ++ sizeof(tuxonice_signature)); ++ toi_sig_data->header_version = TOI_HEADER_VERSION; ++ ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++int remove_old_signature(void) ++{ ++ union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; ++ char *orig_sig, *no_image_signature_contents; ++ char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); ++ int result; ++ struct block_device *header_bdev; ++ struct old_sig_data *old_sig_data = ++ &swap_header_page.pointer->old_sig_data; ++ ++ header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); ++ result = toi_bio_ops.bdev_page_io(READ, header_bdev, ++ old_sig_data->sector, virt_to_page(header_start)); ++ ++ if (result) ++ goto out; ++ ++ /* ++ * TODO: Get the original contents of the first bytes of the swap ++ * header page. ++ */ ++ if (!old_sig_data->orig_sig_type) ++ orig_sig = "SWAP-SPACE"; ++ else ++ orig_sig = "SWAPSPACE2"; ++ ++ memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); ++ memcpy(swap_header_page.ptr, header_start, ++ sizeof(no_image_signature_contents)); ++ ++ result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(swap_header_page.ptr)); ++ ++out: ++ toi_close_bdev(header_bdev); ++ have_old_image = 0; ++ toi_free_page(38, (unsigned long) header_start); ++ return result; ++} ++ ++/* ++ * toi_bio_restore_original_signature - restore the original signature ++ * ++ * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. ++ * It will have the original signature page contents, stored in the image ++ * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain ++ * the contents that were loaded when we started the cycle. ++ */ ++int toi_bio_restore_original_signature(void) ++{ ++ char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; ++ ++ if (have_old_image) ++ return remove_old_signature(); ++ ++ if (!use) { ++ printk("toi_bio_restore_original_signature: No signature " ++ "page loaded.\n"); ++ return 0; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); ++ have_image = 0; ++ toi_sig_data->have_image = 0; ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(use)); ++} ++ ++/* ++ * check_for_signature - See whether we have an image. ++ * ++ * Returns 0 if no image, 1 if there is one, -1 if indeterminate. ++ */ ++int toi_check_for_signature(void) ++{ ++ union p_diskpage swap_header_page; ++ int type; ++ const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; ++ const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; ++ char *swap_header; ++ ++ if (!toi_cur_sig_page) { ++ int result = get_signature_page(); ++ ++ if (result) ++ return result; ++ } ++ ++ /* ++ * Start by looking for the binary header. ++ */ ++ if (!memcmp(tuxonice_signature, toi_cur_sig_page, ++ sizeof(tuxonice_signature))) { ++ have_image = toi_sig_data->have_image; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " ++ "Have image is %d.", have_image); ++ if (have_image) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " ++ "%x. First block is %d.", ++ toi_sig_data->header_dev_t, ++ toi_sig_data->first_header_block); ++ return toi_sig_data->have_image; ++ } ++ ++ /* ++ * Failing that, try old file allocator headers. ++ */ ++ ++ if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { ++ have_image = 1; ++ return 1; ++ } ++ ++ have_image = 0; ++ ++ if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) ++ return 0; ++ ++ /* ++ * Nope? How about swap? ++ */ ++ swap_header_page = (union p_diskpage) toi_cur_sig_page; ++ swap_header = swap_header_page.pointer->swh.magic.magic; ++ ++ /* Normal swapspace? */ ++ for (type = 0; type < 2; type++) ++ if (!memcmp(normal_sigs[type], swap_header, ++ strlen(normal_sigs[type]))) ++ return 0; ++ ++ /* Swsusp or uswsusp? */ ++ for (type = 0; type < 3; type++) ++ if (!memcmp(swsusp_sigs[type], swap_header, ++ strlen(swsusp_sigs[type]))) ++ return 2; ++ ++ /* Old TuxOnIce version? */ ++ if (!memcmp(tuxonice_signature, swap_header, ++ sizeof(tuxonice_signature) - 1)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " ++ "signature."); ++ have_old_image = 1; ++ return 3; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Image_exists ++ * ++ * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). ++ */ ++int toi_bio_image_exists(int quiet) ++{ ++ int result; ++ char *msg = NULL; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); ++ ++ if (!resume_dev_t) { ++ if (!quiet) ++ printk(KERN_INFO "Not even trying to read header " ++ "because resume_dev_t is not set.\n"); ++ return -1; ++ } ++ ++ if (open_resume_dev_t(0, quiet)) ++ return -1; ++ ++ result = toi_check_for_signature(); ++ ++ clear_toi_state(TOI_RESUMED_BEFORE); ++ if (toi_sig_data->resumed_before) ++ set_toi_state(TOI_RESUMED_BEFORE); ++ ++ if (quiet || result == -ENOMEM) ++ return result; ++ ++ if (result == -1) ++ msg = "TuxOnIce: Unable to find a signature." 
++ " Could you have moved a swap file?\n"; ++ else if (!result) ++ msg = "TuxOnIce: No image found.\n"; ++ else if (result == 1) ++ msg = "TuxOnIce: Image found.\n"; ++ else if (result == 2) ++ msg = "TuxOnIce: uswsusp or swsusp image found.\n"; ++ else if (result == 3) ++ msg = "TuxOnIce: Old implementation's signature found.\n"; ++ ++ printk(KERN_INFO "%s", msg); ++ ++ return result; ++} ++ ++int toi_bio_scan_for_image(int quiet) ++{ ++ struct block_device *bdev; ++ char default_name[255] = ""; ++ ++ if (!quiet) ++ printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " ++ "signature...\n"); ++ for (bdev = next_bdev_of_type(NULL, "swap"); bdev; ++ bdev = next_bdev_of_type(bdev, "swap")) { ++ int result; ++ char name[255] = ""; ++ sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ if (!quiet) ++ printk(KERN_DEBUG "- Trying %s.\n", name); ++ resume_block_device = bdev; ++ resume_dev_t = bdev->bd_dev; ++ ++ result = toi_check_for_signature(); ++ ++ resume_block_device = NULL; ++ resume_dev_t = MKDEV(0, 0); ++ ++ if (!default_name[0]) ++ strcpy(default_name, name); ++ ++ if (result == 1) { ++ /* Got one! */ ++ strcpy(resume_file, name); ++ next_bdev_of_type(bdev, NULL); ++ if (!quiet) ++ printk(KERN_DEBUG " ==> Image found on %s.\n", ++ resume_file); ++ return 1; ++ } ++ forget_signature_page(); ++ } ++ ++ if (!quiet) ++ printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); ++ strcpy(resume_file, default_name); ++ return 0; ++} ++ ++int toi_bio_get_header_version(void) ++{ ++ return (memcmp(toi_sig_data->sig2, tuxonice_signature, ++ sizeof(tuxonice_signature))) ? ++ 0 : toi_sig_data->header_version; ++ ++} +diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c +new file mode 100644 +index 0000000..62b5d14 +--- /dev/null ++++ b/kernel/power/tuxonice_builtin.c +@@ -0,0 +1,445 @@ ++/* ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice_io.h" ++#include "tuxonice.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_alloc.h" ++ ++unsigned long toi_bootflags_mask; ++EXPORT_SYMBOL_GPL(toi_bootflags_mask); ++ ++/* ++ * Highmem related functions (x86 only). ++ */ ++ ++#ifdef CONFIG_HIGHMEM ++ ++/** ++ * copyback_high: Restore highmem pages. ++ * ++ * Highmem data and pbe lists are/can be stored in highmem. ++ * The format is slightly different to the lowmem pbe lists ++ * used for the assembly code: the last pbe in each page is ++ * a struct page * instead of struct pbe *, pointing to the ++ * next page where pbes are stored (or NULL if happens to be ++ * the end of the list). Since we don't want to generate ++ * unnecessary deltas against swsusp code, we use a cast ++ * instead of a union. 
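The highmem layout described here, where the slot that would hold the last pbe in each page instead points at the next page of pbes, is a simple chained-block structure. A standalone sketch with simplified stand-in types (not the kernel's struct pbe or struct page) showing how such a chain is walked:

#include <stdio.h>

#define ENTRIES_PER_PAGE 4		/* kept tiny for the example */

struct ex_entry {
	void *payload;			/* in the kernel: original/copy addresses */
};

struct ex_page {
	struct ex_entry entries[ENTRIES_PER_PAGE - 1];
	struct ex_page *next;		/* occupies the final slot of the page */
};

static void walk(struct ex_page *page)
{
	while (page) {
		int i;

		for (i = 0; i < ENTRIES_PER_PAGE - 1; i++)
			if (page->entries[i].payload)
				printf("entry %p\n", page->entries[i].payload);

		page = page->next;	/* follow the link stored in the last slot */
	}
}

int main(void)
{
	struct ex_page second = { { { (void *) 0x3 } }, NULL };
	struct ex_page first = { { { (void *) 0x1 }, { (void *) 0x2 } }, &second };

	walk(&first);
	return 0;
}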
++ **/ ++ ++static void copyback_high(void) ++{ ++ struct page *pbe_page = (struct page *) restore_highmem_pblist; ++ struct pbe *this_pbe, *first_pbe; ++ unsigned long *origpage, *copypage; ++ int pbe_index = 1; ++ ++ if (!pbe_page) ++ return; ++ ++ this_pbe = (struct pbe *) kmap_atomic(pbe_page); ++ first_pbe = this_pbe; ++ ++ while (this_pbe) { ++ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; ++ ++ origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); ++ copypage = kmap_atomic((struct page *) this_pbe->address); ++ ++ while (loop >= 0) { ++ *(origpage + loop) = *(copypage + loop); ++ loop--; ++ } ++ ++ kunmap_atomic(origpage); ++ kunmap_atomic(copypage); ++ ++ if (!this_pbe->next) ++ break; ++ ++ if (pbe_index < PBES_PER_PAGE) { ++ this_pbe++; ++ pbe_index++; ++ } else { ++ pbe_page = (struct page *) this_pbe->next; ++ kunmap_atomic(first_pbe); ++ if (!pbe_page) ++ return; ++ this_pbe = (struct pbe *) kmap_atomic(pbe_page); ++ first_pbe = this_pbe; ++ pbe_index = 1; ++ } ++ } ++ kunmap_atomic(first_pbe); ++} ++ ++#else /* CONFIG_HIGHMEM */ ++static void copyback_high(void) { } ++#endif ++ ++char toi_wait_for_keypress_dev_console(int timeout) ++{ ++ int fd, this_timeout = 255; ++ char key = '\0'; ++ struct termios t, t_backup; ++ ++ /* We should be guaranteed /dev/console exists after populate_rootfs() ++ * in init/main.c. ++ */ ++ fd = sys_open("/dev/console", O_RDONLY, 0); ++ if (fd < 0) { ++ printk(KERN_INFO "Couldn't open /dev/console.\n"); ++ return key; ++ } ++ ++ if (sys_ioctl(fd, TCGETS, (long)&t) < 0) ++ goto out_close; ++ ++ memcpy(&t_backup, &t, sizeof(t)); ++ ++ t.c_lflag &= ~(ISIG|ICANON|ECHO); ++ t.c_cc[VMIN] = 0; ++ ++new_timeout: ++ if (timeout > 0) { ++ this_timeout = timeout < 26 ? timeout : 25; ++ timeout -= this_timeout; ++ this_timeout *= 10; ++ } ++ ++ t.c_cc[VTIME] = this_timeout; ++ ++ if (sys_ioctl(fd, TCSETS, (long)&t) < 0) ++ goto out_restore; ++ ++ while (1) { ++ if (sys_read(fd, &key, 1) <= 0) { ++ if (timeout) ++ goto new_timeout; ++ key = '\0'; ++ break; ++ } ++ key = tolower(key); ++ if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { ++ if (key == 'c') { ++ set_toi_state(TOI_CONTINUE_REQ); ++ break; ++ } else if (key == ' ') ++ break; ++ } else ++ break; ++ } ++ ++out_restore: ++ sys_ioctl(fd, TCSETS, (long)&t_backup); ++out_close: ++ sys_close(fd); ++ ++ return key; ++} ++EXPORT_SYMBOL_GPL(toi_wait_for_keypress_dev_console); ++ ++struct toi_boot_kernel_data toi_bkd __nosavedata ++ __attribute__((aligned(PAGE_SIZE))) = { ++ MY_BOOT_KERNEL_DATA_VERSION, ++ 0, ++#ifdef CONFIG_TOI_REPLACE_SWSUSP ++ (1 << TOI_REPLACE_SWSUSP) | ++#endif ++ (1 << TOI_NO_FLUSHER_THREAD) | ++ (1 << TOI_PAGESET2_FULL) | (1 << TOI_LATE_CPU_HOTPLUG), ++}; ++EXPORT_SYMBOL_GPL(toi_bkd); ++ ++struct block_device *toi_open_by_devnum(dev_t dev) ++{ ++ struct block_device *bdev = bdget(dev); ++ int err = -ENOMEM; ++ if (bdev) ++ err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); ++ return err ? ERR_PTR(err) : bdev; ++} ++EXPORT_SYMBOL_GPL(toi_open_by_devnum); ++ ++/** ++ * toi_close_bdev: Close a swap bdev. ++ * ++ * int: The swap entry number to close. 
++ */ ++void toi_close_bdev(struct block_device *bdev) ++{ ++ blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); ++} ++EXPORT_SYMBOL_GPL(toi_close_bdev); ++ ++int toi_wait = CONFIG_TOI_DEFAULT_WAIT; ++EXPORT_SYMBOL_GPL(toi_wait); ++ ++struct toi_core_fns *toi_core_fns; ++EXPORT_SYMBOL_GPL(toi_core_fns); ++ ++unsigned long toi_result; ++EXPORT_SYMBOL_GPL(toi_result); ++ ++struct pagedir pagedir1 = {1}; ++EXPORT_SYMBOL_GPL(pagedir1); ++ ++unsigned long toi_get_nonconflicting_page(void) ++{ ++ return toi_core_fns->get_nonconflicting_page(); ++} ++ ++int toi_post_context_save(void) ++{ ++ return toi_core_fns->post_context_save(); ++} ++ ++int try_tuxonice_hibernate(void) ++{ ++ if (!toi_core_fns) ++ return -ENODEV; ++ ++ return toi_core_fns->try_hibernate(); ++} ++ ++static int num_resume_calls; ++#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL ++static int ignore_late_initcall = 1; ++#else ++static int ignore_late_initcall; ++#endif ++ ++int toi_translate_err_default = TOI_CONTINUE_REQ; ++EXPORT_SYMBOL_GPL(toi_translate_err_default); ++ ++void try_tuxonice_resume(void) ++{ ++ /* Don't let it wrap around eventually */ ++ if (num_resume_calls < 2) ++ num_resume_calls++; ++ ++ if (num_resume_calls == 1 && ignore_late_initcall) { ++ printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); ++ return; ++ } ++ ++ if (toi_core_fns) ++ toi_core_fns->try_resume(); ++ else ++ printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); ++} ++ ++int toi_lowlevel_builtin(void) ++{ ++ int error = 0; ++ ++ save_processor_state(); ++ error = swsusp_arch_suspend(); ++ if (error) ++ printk(KERN_ERR "Error %d hibernating\n", error); ++ ++ /* Restore control flow appears here */ ++ if (!toi_in_hibernate) { ++ copyback_high(); ++ set_toi_state(TOI_NOW_RESUMING); ++ } ++ ++ restore_processor_state(); ++ return error; ++} ++EXPORT_SYMBOL_GPL(toi_lowlevel_builtin); ++ ++unsigned long toi_compress_bytes_in; ++EXPORT_SYMBOL_GPL(toi_compress_bytes_in); ++ ++unsigned long toi_compress_bytes_out; ++EXPORT_SYMBOL_GPL(toi_compress_bytes_out); ++ ++int toi_in_suspend(void) ++{ ++ return in_suspend; ++} ++EXPORT_SYMBOL_GPL(toi_in_suspend); ++ ++unsigned long toi_state = ((1 << TOI_BOOT_TIME) | ++ (1 << TOI_IGNORE_LOGLEVEL) | ++ (1 << TOI_IO_STOPPED)); ++EXPORT_SYMBOL_GPL(toi_state); ++ ++/* The number of hibernates we have started (some may have been cancelled) */ ++unsigned int nr_hibernates; ++EXPORT_SYMBOL_GPL(nr_hibernates); ++ ++int toi_running; ++EXPORT_SYMBOL_GPL(toi_running); ++ ++__nosavedata int toi_in_hibernate; ++EXPORT_SYMBOL_GPL(toi_in_hibernate); ++ ++__nosavedata struct pbe *restore_highmem_pblist; ++EXPORT_SYMBOL_GPL(restore_highmem_pblist); ++ ++int toi_trace_allocs; ++EXPORT_SYMBOL_GPL(toi_trace_allocs); ++ ++void toi_read_lock_tasklist(void) ++{ ++ read_lock(&tasklist_lock); ++} ++EXPORT_SYMBOL_GPL(toi_read_lock_tasklist); ++ ++void toi_read_unlock_tasklist(void) ++{ ++ read_unlock(&tasklist_lock); ++} ++EXPORT_SYMBOL_GPL(toi_read_unlock_tasklist); ++ ++#ifdef CONFIG_TOI_ZRAM_SUPPORT ++int (*toi_flag_zram_disks) (void); ++EXPORT_SYMBOL_GPL(toi_flag_zram_disks); ++ ++int toi_do_flag_zram_disks(void) ++{ ++ return toi_flag_zram_disks ? 
(*toi_flag_zram_disks)() : 0; ++} ++EXPORT_SYMBOL_GPL(toi_do_flag_zram_disks); ++#endif ++ ++static int __init toi_wait_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) { ++ if (value < -1 || value > 255) ++ printk(KERN_INFO "TuxOnIce_wait outside range -1 to " ++ "255.\n"); ++ else ++ toi_wait = value; ++ } ++ ++ return 1; ++} ++ ++__setup("toi_wait", toi_wait_setup); ++ ++static int __init toi_translate_retry_setup(char *str) ++{ ++ toi_translate_err_default = 0; ++ return 1; ++} ++ ++__setup("toi_translate_retry", toi_translate_retry_setup); ++ ++static int __init toi_debug_setup(char *str) ++{ ++ toi_bkd.toi_action |= (1 << TOI_LOGALL); ++ toi_bootflags_mask |= (1 << TOI_LOGALL); ++ toi_bkd.toi_debug_state = 255; ++ toi_bkd.toi_default_console_level = 7; ++ return 1; ++} ++ ++__setup("toi_debug_setup", toi_debug_setup); ++ ++static int __init toi_pause_setup(char *str) ++{ ++ toi_bkd.toi_action |= (1 << TOI_PAUSE); ++ toi_bootflags_mask |= (1 << TOI_PAUSE); ++ return 1; ++} ++ ++__setup("toi_pause", toi_pause_setup); ++ ++#ifdef CONFIG_PM_DEBUG ++static int __init toi_trace_allocs_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ toi_trace_allocs = value; ++ ++ return 1; ++} ++__setup("toi_trace_allocs", toi_trace_allocs_setup); ++#endif ++ ++static int __init toi_ignore_late_initcall_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ ignore_late_initcall = value; ++ ++ return 1; ++} ++ ++__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); ++ ++static int __init toi_force_no_multithreaded_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); ++ toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); ++ ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); ++ ++ return 1; ++} ++ ++__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); ++ ++#ifdef CONFIG_KGDB ++static int __init toi_post_resume_breakpoint_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); ++ toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); ++ ++ return 1; ++} ++ ++__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); ++#endif ++ ++static int __init toi_disable_readahead_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); ++ toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); ++ ++ return 1; ++} ++ ++__setup("toi_no_readahead", toi_disable_readahead_setup); +diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h +new file mode 100644 +index 0000000..eea0155 +--- /dev/null ++++ b/kernel/power/tuxonice_builtin.h +@@ -0,0 +1,39 @@ ++/* ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++#include ++ ++extern struct toi_core_fns *toi_core_fns; ++extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; ++extern unsigned int nr_hibernates; ++extern int toi_in_hibernate; ++ ++extern __nosavedata struct pbe *restore_highmem_pblist; ++ ++int toi_lowlevel_builtin(void); ++ ++#ifdef CONFIG_HIGHMEM ++extern __nosavedata struct zone_data *toi_nosave_zone_list; ++extern __nosavedata unsigned long toi_nosave_max_pfn; ++#endif ++ ++extern unsigned long toi_get_nonconflicting_page(void); ++extern int toi_post_context_save(void); ++ ++extern char toi_wait_for_keypress_dev_console(int timeout); ++extern struct block_device *toi_open_by_devnum(dev_t dev); ++extern void toi_close_bdev(struct block_device *bdev); ++extern int toi_wait; ++extern int toi_translate_err_default; ++extern int toi_force_no_multithreaded; ++extern void toi_read_lock_tasklist(void); ++extern void toi_read_unlock_tasklist(void); ++extern int toi_in_suspend(void); ++ ++#ifdef CONFIG_TOI_ZRAM_SUPPORT ++extern int toi_do_flag_zram_disks(void); ++#else ++#define toi_do_flag_zram_disks() (0) ++#endif +diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c +new file mode 100644 +index 0000000..006e68b +--- /dev/null ++++ b/kernel/power/tuxonice_checksum.c +@@ -0,0 +1,384 @@ ++/* ++ * kernel/power/tuxonice_checksum.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data checksum routines for TuxOnIce, ++ * using cryptoapi. They are used to locate any modifications ++ * made to pageset 2 while we're saving it. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++ ++static struct toi_module_ops toi_checksum_ops; ++ ++/* Constant at the mo, but I might allow tuning later */ ++static char toi_checksum_name[32] = "md4"; ++/* Bytes per checksum */ ++#define CHECKSUM_SIZE (16) ++ ++#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) ++ ++struct cpu_context { ++ struct crypto_hash *transform; ++ struct hash_desc desc; ++ struct scatterlist sg[2]; ++ char *buf; ++}; ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++static int pages_allocated; ++static unsigned long page_list; ++ ++static int toi_num_resaved; ++ ++static unsigned long this_checksum, next_page; ++static int checksum_index; ++ ++static inline int checksum_pages_needed(void) ++{ ++ return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); ++} ++ ++/* ---- Local buffer management ---- */ ++ ++/* ++ * toi_checksum_cleanup ++ * ++ * Frees memory allocated for our labours. ++ */ ++static void toi_checksum_cleanup(int ending_cycle) ++{ ++ int cpu; ++ ++ if (ending_cycle) { ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->transform) { ++ crypto_free_hash(this->transform); ++ this->transform = NULL; ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->buf) { ++ toi_free_page(27, (unsigned long) this->buf); ++ this->buf = NULL; ++ } ++ } ++ } ++} ++ ++/* ++ * toi_crypto_initialise ++ * ++ * Prepare to do some work by allocating buffers and transforms. ++ * Returns: Int: Zero. Even if we can't set up checksum, we still ++ * seek to hibernate. 
++ */ ++static int toi_checksum_initialise(int starting_cycle) ++{ ++ int cpu; ++ ++ if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) ++ return 0; ++ ++ if (!*toi_checksum_name) { ++ printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ struct page *page; ++ ++ this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); ++ if (IS_ERR(this->transform)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s checksum algorithm: %ld.\n", ++ toi_checksum_name, (long) this->transform); ++ this->transform = NULL; ++ return 1; ++ } ++ ++ this->desc.tfm = this->transform; ++ this->desc.flags = 0; ++ ++ page = toi_alloc_page(27, GFP_KERNEL); ++ if (!page) ++ return 1; ++ this->buf = page_address(page); ++ sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); ++ } ++ return 0; ++} ++ ++/* ++ * toi_checksum_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_checksum_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ if (!toi_checksum_ops.enabled) ++ return scnprintf(buffer, size, ++ "- Checksumming disabled.\n"); ++ ++ len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", ++ toi_checksum_name); ++ len += scnprintf(buffer + len, size - len, ++ " %d pages resaved in atomic copy.\n", toi_num_resaved); ++ return len; ++} ++ ++static int toi_checksum_memory_needed(void) ++{ ++ return toi_checksum_ops.enabled ? ++ checksum_pages_needed() << PAGE_SHIFT : 0; ++} ++ ++static int toi_checksum_storage_needed(void) ++{ ++ if (toi_checksum_ops.enabled) ++ return strlen(toi_checksum_name) + sizeof(int) + 1; ++ else ++ return 0; ++} ++ ++/* ++ * toi_checksum_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. ++ */ ++static int toi_checksum_save_config_info(char *buffer) ++{ ++ int namelen = strlen(toi_checksum_name) + 1; ++ int total_len; ++ ++ *((unsigned int *) buffer) = namelen; ++ strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); ++ total_len = sizeof(unsigned int) + namelen; ++ return total_len; ++} ++ ++/* toi_checksum_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for dechecksuming the image at ++ * resume time. 
++ */ ++static void toi_checksum_load_config_info(char *buffer, int size) ++{ ++ int namelen; ++ ++ namelen = *((unsigned int *) (buffer)); ++ strncpy(toi_checksum_name, buffer + sizeof(unsigned int), ++ namelen); ++ return; ++} ++ ++/* ++ * Free Checksum Memory ++ */ ++ ++void free_checksum_pages(void) ++{ ++ while (pages_allocated) { ++ unsigned long next = *((unsigned long *) page_list); ++ ClearPageNosave(virt_to_page(page_list)); ++ toi_free_page(15, (unsigned long) page_list); ++ page_list = next; ++ pages_allocated--; ++ } ++} ++ ++/* ++ * Allocate Checksum Memory ++ */ ++ ++int allocate_checksum_pages(void) ++{ ++ int pages_needed = checksum_pages_needed(); ++ ++ if (!toi_checksum_ops.enabled) ++ return 0; ++ ++ while (pages_allocated < pages_needed) { ++ unsigned long *new_page = ++ (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); ++ if (!new_page) { ++ printk(KERN_ERR "Unable to allocate checksum pages.\n"); ++ return -ENOMEM; ++ } ++ SetPageNosave(virt_to_page(new_page)); ++ (*new_page) = page_list; ++ page_list = (unsigned long) new_page; ++ pages_allocated++; ++ } ++ ++ next_page = (unsigned long) page_list; ++ checksum_index = 0; ++ ++ return 0; ++} ++ ++char *tuxonice_get_next_checksum(void) ++{ ++ if (!toi_checksum_ops.enabled) ++ return NULL; ++ ++ if (checksum_index % CHECKSUMS_PER_PAGE) ++ this_checksum += CHECKSUM_SIZE; ++ else { ++ this_checksum = next_page + sizeof(void *); ++ next_page = *((unsigned long *) next_page); ++ } ++ ++ checksum_index++; ++ return (char *) this_checksum; ++} ++ ++int tuxonice_calc_checksum(struct page *page, char *checksum_locn) ++{ ++ char *pa; ++ int result, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!toi_checksum_ops.enabled) ++ return 0; ++ ++ pa = kmap(page); ++ memcpy(ctx->buf, pa, PAGE_SIZE); ++ kunmap(page); ++ result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, ++ checksum_locn); ++ if (result) ++ printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " ++ "returned %d.\n", result); ++ return result; ++} ++/* ++ * Calculate checksums ++ */ ++ ++void check_checksums(void) ++{ ++ int pfn, index = 0, cpu = smp_processor_id(); ++ char current_checksum[CHECKSUM_SIZE]; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!toi_checksum_ops.enabled) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); ++ return; ++ } ++ ++ next_page = (unsigned long) page_list; ++ ++ toi_num_resaved = 0; ++ this_checksum = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); ++ memory_bm_position_reset(pageset2_map); ++ for (pfn = memory_bm_next_pfn(pageset2_map); pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn(pageset2_map)) { ++ int ret; ++ char *pa; ++ struct page *page = pfn_to_page(pfn); ++ ++ if (index % CHECKSUMS_PER_PAGE) { ++ this_checksum += CHECKSUM_SIZE; ++ } else { ++ this_checksum = next_page + sizeof(void *); ++ next_page = *((unsigned long *) next_page); ++ } ++ ++ /* Done when IRQs disabled so must be atomic */ ++ pa = kmap_atomic(page); ++ memcpy(ctx->buf, pa, PAGE_SIZE); ++ kunmap_atomic(pa); ++ ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, ++ current_checksum); ++ ++ if (ret) { ++ printk(KERN_INFO "Digest failed. 
Returned %d.\n", ret); ++ return; ++ } ++ ++ if (memcmp(current_checksum, (char *) this_checksum, ++ CHECKSUM_SIZE)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Resaving %ld.", ++ pfn); ++ SetPageResave(pfn_to_page(pfn)); ++ toi_num_resaved++; ++ if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) ++ set_abort_result(TOI_RESAVE_NEEDED); ++ } ++ ++ index++; ++ } ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_ABORT_ON_RESAVE_NEEDED, 0) ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_checksum_ops = { ++ .type = MISC_MODULE, ++ .name = "checksumming", ++ .directory = "checksum", ++ .module = THIS_MODULE, ++ .initialise = toi_checksum_initialise, ++ .cleanup = toi_checksum_cleanup, ++ .print_debug_info = toi_checksum_print_debug_stats, ++ .save_config_info = toi_checksum_save_config_info, ++ .load_config_info = toi_checksum_load_config_info, ++ .memory_needed = toi_checksum_memory_needed, ++ .storage_needed = toi_checksum_storage_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++int toi_checksum_init(void) ++{ ++ int result = toi_register_module(&toi_checksum_ops); ++ return result; ++} ++ ++void toi_checksum_exit(void) ++{ ++ toi_unregister_module(&toi_checksum_ops); ++} +diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h +new file mode 100644 +index 0000000..0f2812e +--- /dev/null ++++ b/kernel/power/tuxonice_checksum.h +@@ -0,0 +1,31 @@ ++/* ++ * kernel/power/tuxonice_checksum.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data checksum routines for TuxOnIce, ++ * using cryptoapi. They are used to locate any modifications ++ * made to pageset 2 while we're saving it. ++ */ ++ ++#if defined(CONFIG_TOI_CHECKSUM) ++extern int toi_checksum_init(void); ++extern void toi_checksum_exit(void); ++void check_checksums(void); ++int allocate_checksum_pages(void); ++void free_checksum_pages(void); ++char *tuxonice_get_next_checksum(void); ++int tuxonice_calc_checksum(struct page *page, char *checksum_locn); ++#else ++static inline int toi_checksum_init(void) { return 0; } ++static inline void toi_checksum_exit(void) { } ++static inline void check_checksums(void) { }; ++static inline int allocate_checksum_pages(void) { return 0; }; ++static inline void free_checksum_pages(void) { }; ++static inline char *tuxonice_get_next_checksum(void) { return NULL; }; ++static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) ++ { return 0; } ++#endif ++ +diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c +new file mode 100644 +index 0000000..0e5a262 +--- /dev/null ++++ b/kernel/power/tuxonice_cluster.c +@@ -0,0 +1,1069 @@ ++/* ++ * kernel/power/tuxonice_cluster.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains routines for cluster hibernation support. ++ * ++ * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. ++ * ++ * How does it work? ++ * ++ * There is no 'master' node that tells everyone else what to do. 
All nodes ++ * send messages to the broadcast address/port, maintain a list of peers ++ * and figure out when to progress to the next step in hibernating or resuming. ++ * This makes us more fault tolerant when it comes to nodes coming and going ++ * (which may be more of an issue if we're hibernating when power supplies ++ * are being unreliable). ++ * ++ * At boot time, we start a ktuxonice thread that handles communication with ++ * other nodes. This node maintains a state machine that controls our progress ++ * through hibernating and resuming, keeping us in step with other nodes. Nodes ++ * are identified by their hw address. ++ * ++ * On startup, the node sends CLUSTER_PING on the configured interface's ++ * broadcast address, port $toi_cluster_port (see below) and begins to listen ++ * for other broadcast messages. CLUSTER_PING messages are repeated at ++ * intervals of 5 minutes, with a random offset to spread traffic out. ++ * ++ * A hibernation cycle is initiated from any node via ++ * ++ * echo > /sys/power/tuxonice/do_hibernate ++ * ++ * and (possibily) the hibernate script. At each step of the process, the node ++ * completes its work, and waits for all other nodes to signal completion of ++ * their work (or timeout) before progressing to the next step. ++ * ++ * Request/state Action before reply Possible reply Next state ++ * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP ++ * HIBERNATE|NACK INIT_0 ++ * ++ * PREP prepare_image PREP|ACK IMAGE_WRITE ++ * PREP|NACK INIT_0 ++ * ABORT RUNNING ++ * ++ * IO write image IO|ACK power off ++ * ABORT POST_RESUME ++ * ++ * (Boot time) check for image IMAGE|ACK RESUME_PREP ++ * (Note 1) ++ * IMAGE|NACK (Note 2) ++ * ++ * PREP prepare read image PREP|ACK IMAGE_READ ++ * PREP|NACK (As NACK_IMAGE) ++ * ++ * IO read image IO|ACK POST_RESUME ++ * ++ * POST_RESUME thaw, post-script RUNNING ++ * ++ * INIT_0 init 0 ++ * ++ * Other messages: ++ * ++ * - PING: Request for all other live nodes to send a PONG. Used at startup to ++ * announce presence, when a node is suspected dead and periodically, in case ++ * segments of the network are [un]plugged. ++ * ++ * - PONG: Response to a PING. ++ * ++ * - ABORT: Request to cancel writing an image. ++ * ++ * - BYE: Notification that this node is shutting down. ++ * ++ * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that ++ * nodes which are slower to start up can get state synchronised. If a node ++ * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send ++ * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it ++ * must invalidate its image (if any) and boot normally. ++ * ++ * Note 2: May occur when one node lost power or powered off while others ++ * hibernated. This node waits for others to complete resuming (ACK_READ) ++ * before completing its boot, so that it appears as a fail node restarting. ++ * ++ * If any node has an image, then it also has a list of nodes that hibernated ++ * in synchronisation with it. The node will wait for other nodes to appear ++ * or timeout before beginning its restoration. ++ * ++ * If a node has no image, it needs to wait, in case other nodes which do have ++ * an image are going to resume, but are taking longer to announce their ++ * presence. For this reason, the user can specify a timeout value and a number ++ * of nodes detected before we just continue. 
(We might want to assume in a ++ * cluster of, say, 15 nodes, if 8 others have booted without finding an image, ++ * the remaining nodes will too. This might help in situations where some nodes ++ * are much slower to boot, or more subject to hardware failures or such like). ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++ ++#if 1 ++#define PRINTK(a, b...) do { printk(a, ##b); } while (0) ++#else ++#define PRINTK(a, b...) do { } while (0) ++#endif ++ ++static int loopback_mode; ++static int num_local_nodes = 1; ++#define MAX_LOCAL_NODES 8 ++#define SADDR (loopback_mode ? b->sid : h->saddr) ++ ++#define MYNAME "TuxOnIce Clustering" ++ ++enum cluster_message { ++ MSG_ACK = 1, ++ MSG_NACK = 2, ++ MSG_PING = 4, ++ MSG_ABORT = 8, ++ MSG_BYE = 16, ++ MSG_HIBERNATE = 32, ++ MSG_IMAGE = 64, ++ MSG_IO = 128, ++ MSG_RUNNING = 256 ++}; ++ ++static char *str_message(int message) ++{ ++ switch (message) { ++ case 4: ++ return "Ping"; ++ case 8: ++ return "Abort"; ++ case 9: ++ return "Abort acked"; ++ case 10: ++ return "Abort nacked"; ++ case 16: ++ return "Bye"; ++ case 17: ++ return "Bye acked"; ++ case 18: ++ return "Bye nacked"; ++ case 32: ++ return "Hibernate request"; ++ case 33: ++ return "Hibernate ack"; ++ case 34: ++ return "Hibernate nack"; ++ case 64: ++ return "Image exists?"; ++ case 65: ++ return "Image does exist"; ++ case 66: ++ return "No image here"; ++ case 128: ++ return "I/O"; ++ case 129: ++ return "I/O okay"; ++ case 130: ++ return "I/O failed"; ++ case 256: ++ return "Running"; ++ default: ++ printk(KERN_ERR "Unrecognised message %d.\n", message); ++ return "Unrecognised message (see dmesg)"; ++ } ++} ++ ++#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) ++#define MSG_STATE_MASK (~MSG_ACK_MASK) ++ ++struct node_info { ++ struct list_head member_list; ++ wait_queue_head_t member_events; ++ spinlock_t member_list_lock; ++ spinlock_t receive_lock; ++ int peer_count, ignored_peer_count; ++ struct toi_sysfs_data sysfs_data; ++ enum cluster_message current_message; ++}; ++ ++struct node_info node_array[MAX_LOCAL_NODES]; ++ ++struct cluster_member { ++ __be32 addr; ++ enum cluster_message message; ++ struct list_head list; ++ int ignore; ++}; ++ ++#define toi_cluster_port_send 3501 ++#define toi_cluster_port_recv 3502 ++ ++static struct net_device *net_dev; ++static struct toi_module_ops toi_cluster_ops; ++ ++static int toi_recv(struct sk_buff *skb, struct net_device *dev, ++ struct packet_type *pt, struct net_device *orig_dev); ++ ++static struct packet_type toi_cluster_packet_type = { ++ .type = __constant_htons(ETH_P_IP), ++ .func = toi_recv, ++}; ++ ++struct toi_pkt { /* BOOTP packet format */ ++ struct iphdr iph; /* IP header */ ++ struct udphdr udph; /* UDP header */ ++ u8 htype; /* HW address type */ ++ u8 hlen; /* HW address length */ ++ __be32 xid; /* Transaction ID */ ++ __be16 secs; /* Seconds since we started */ ++ __be16 flags; /* Just what it says */ ++ u8 hw_addr[16]; /* Sender's HW address */ ++ u16 message; /* Message */ ++ unsigned long sid; /* Source ID for loopback testing */ ++}; ++ ++static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; ++ ++static int added_pack; ++ ++static int others_have_image; ++ ++/* Key used to allow multiple clusters on the same lan */ ++static char toi_cluster_key[32] = 
CONFIG_TOI_DEFAULT_CLUSTER_KEY; ++static char pre_hibernate_script[255] = ++ CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; ++static char post_hibernate_script[255] = ++ CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; ++ ++/* List of cluster members */ ++static unsigned long continue_delay = 5 * HZ; ++static unsigned long cluster_message_timeout = 3 * HZ; ++ ++/* === Membership list === */ ++ ++static void print_member_info(int index) ++{ ++ struct cluster_member *this; ++ ++ printk(KERN_INFO "==> Dumping node %d.\n", index); ++ ++ list_for_each_entry(this, &node_array[index].member_list, list) ++ printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", ++ NIPQUAD(this->addr), ++ str_message(this->message), ++ this->ignore ? "(Ignored)" : ""); ++ printk(KERN_INFO "== Done ==\n"); ++} ++ ++static struct cluster_member *__find_member(int index, __be32 addr) ++{ ++ struct cluster_member *this; ++ ++ list_for_each_entry(this, &node_array[index].member_list, list) { ++ if (this->addr != addr) ++ continue; ++ ++ return this; ++ } ++ ++ return NULL; ++} ++ ++static void set_ignore(int index, __be32 addr, struct cluster_member *this) ++{ ++ if (this->ignore) { ++ PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", ++ index, NIPQUAD(addr)); ++ return; ++ } ++ ++ PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", ++ index, NIPQUAD(addr)); ++ this->ignore = 1; ++ node_array[index].ignored_peer_count++; ++} ++ ++static int __add_update_member(int index, __be32 addr, int message) ++{ ++ struct cluster_member *this; ++ ++ this = __find_member(index, addr); ++ if (this) { ++ if (this->message != message) { ++ this->message = message; ++ if ((message & MSG_NACK) && ++ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) ++ set_ignore(index, addr, this); ++ PRINTK("Node %d sees node %d.%d.%d.%d now sending " ++ "%s.\n", index, NIPQUAD(addr), ++ str_message(message)); ++ wake_up(&node_array[index].member_events); ++ } ++ return 0; ++ } ++ ++ this = (struct cluster_member *) toi_kzalloc(36, ++ sizeof(struct cluster_member), GFP_KERNEL); ++ ++ if (!this) ++ return -1; ++ ++ this->addr = addr; ++ this->message = message; ++ this->ignore = 0; ++ INIT_LIST_HEAD(&this->list); ++ ++ node_array[index].peer_count++; ++ ++ PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, ++ NIPQUAD(addr), str_message(message)); ++ ++ if ((message & MSG_NACK) && ++ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) ++ set_ignore(index, addr, this); ++ list_add_tail(&this->list, &node_array[index].member_list); ++ return 1; ++} ++ ++static int add_update_member(int index, __be32 addr, int message) ++{ ++ int result; ++ unsigned long flags; ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ result = __add_update_member(index, addr, message); ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++ ++ print_member_info(index); ++ ++ wake_up(&node_array[index].member_events); ++ ++ return result; ++} ++ ++static void del_member(int index, __be32 addr) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ this = __find_member(index, addr); ++ ++ if (this) { ++ list_del_init(&this->list); ++ toi_kfree(36, this, sizeof(*this)); ++ node_array[index].peer_count--; ++ } ++ ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++} ++ ++/* === Message transmission === */ ++ ++static void toi_send_if(int message, unsigned long my_id); ++ ++/* ++ * Process received TOI packet. 
++ */ ++static int toi_recv(struct sk_buff *skb, struct net_device *dev, ++ struct packet_type *pt, struct net_device *orig_dev) ++{ ++ struct toi_pkt *b; ++ struct iphdr *h; ++ int len, result, index; ++ unsigned long addr, message, ack; ++ ++ /* Perform verifications before taking the lock. */ ++ if (skb->pkt_type == PACKET_OTHERHOST) ++ goto drop; ++ ++ if (dev != net_dev) ++ goto drop; ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return NET_RX_DROP; ++ ++ if (!pskb_may_pull(skb, ++ sizeof(struct iphdr) + ++ sizeof(struct udphdr))) ++ goto drop; ++ ++ b = (struct toi_pkt *)skb_network_header(skb); ++ h = &b->iph; ++ ++ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) ++ goto drop; ++ ++ /* Fragments are not supported */ ++ if (h->frag_off & htons(IP_OFFSET | IP_MF)) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "TuxOnIce: Ignoring fragmented " ++ "cluster message.\n"); ++ goto drop; ++ } ++ ++ if (skb->len < ntohs(h->tot_len)) ++ goto drop; ++ ++ if (ip_fast_csum((char *) h, h->ihl)) ++ goto drop; ++ ++ if (b->udph.source != htons(toi_cluster_port_send) || ++ b->udph.dest != htons(toi_cluster_port_recv)) ++ goto drop; ++ ++ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) ++ goto drop; ++ ++ len = ntohs(b->udph.len) - sizeof(struct udphdr); ++ ++ /* Ok the front looks good, make sure we can get at the rest. */ ++ if (!pskb_may_pull(skb, skb->len)) ++ goto drop; ++ ++ b = (struct toi_pkt *)skb_network_header(skb); ++ h = &b->iph; ++ ++ addr = SADDR; ++ PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", ++ str_message(b->message), NIPQUAD(addr)); ++ ++ message = b->message & MSG_STATE_MASK; ++ ack = b->message & MSG_ACK_MASK; ++ ++ for (index = 0; index < num_local_nodes; index++) { ++ int new_message = node_array[index].current_message, ++ old_message = new_message; ++ ++ if (index == SADDR || !old_message) { ++ PRINTK("Ignoring node %d (offline or self).\n", index); ++ continue; ++ } ++ ++ /* One message at a time, please. */ ++ spin_lock(&node_array[index].receive_lock); ++ ++ result = add_update_member(index, SADDR, b->message); ++ if (result == -1) { ++ printk(KERN_INFO "Failed to add new cluster member " ++ NIPQUAD_FMT ".\n", ++ NIPQUAD(addr)); ++ goto drop_unlock; ++ } ++ ++ switch (b->message & MSG_STATE_MASK) { ++ case MSG_PING: ++ break; ++ case MSG_ABORT: ++ break; ++ case MSG_BYE: ++ break; ++ case MSG_HIBERNATE: ++ /* Can I hibernate? */ ++ new_message = MSG_HIBERNATE | ++ ((index & 1) ? MSG_NACK : MSG_ACK); ++ break; ++ case MSG_IMAGE: ++ /* Can I resume? */ ++ new_message = MSG_IMAGE | ++ ((index & 1) ? MSG_NACK : MSG_ACK); ++ if (new_message != old_message) ++ printk(KERN_ERR "Setting whether I can resume " ++ "to %d.\n", new_message); ++ break; ++ case MSG_IO: ++ new_message = MSG_IO | MSG_ACK; ++ break; ++ case MSG_RUNNING: ++ break; ++ default: ++ if (net_ratelimit()) ++ printk(KERN_ERR "Unrecognised TuxOnIce cluster" ++ " message %d from " NIPQUAD_FMT ".\n", ++ b->message, NIPQUAD(addr)); ++ }; ++ ++ if (old_message != new_message) { ++ node_array[index].current_message = new_message; ++ printk(KERN_INFO ">>> Sending new message for node " ++ "%d.\n", index); ++ toi_send_if(new_message, index); ++ } else if (!ack) { ++ printk(KERN_INFO ">>> Resending message for node %d.\n", ++ index); ++ toi_send_if(new_message, index); ++ } ++drop_unlock: ++ spin_unlock(&node_array[index].receive_lock); ++ }; ++ ++drop: ++ /* Throw the packet out. 
*/ ++ kfree_skb(skb); ++ ++ return 0; ++} ++ ++/* ++ * Send cluster message to single interface. ++ */ ++static void toi_send_if(int message, unsigned long my_id) ++{ ++ struct sk_buff *skb; ++ struct toi_pkt *b; ++ int hh_len = LL_RESERVED_SPACE(net_dev); ++ struct iphdr *h; ++ ++ /* Allocate packet */ ++ skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); ++ if (!skb) ++ return; ++ skb_reserve(skb, hh_len); ++ b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); ++ memset(b, 0, sizeof(struct toi_pkt)); ++ ++ /* Construct IP header */ ++ skb_reset_network_header(skb); ++ h = ip_hdr(skb); ++ h->version = 4; ++ h->ihl = 5; ++ h->tot_len = htons(sizeof(struct toi_pkt)); ++ h->frag_off = htons(IP_DF); ++ h->ttl = 64; ++ h->protocol = IPPROTO_UDP; ++ h->daddr = htonl(INADDR_BROADCAST); ++ h->check = ip_fast_csum((unsigned char *) h, h->ihl); ++ ++ /* Construct UDP header */ ++ b->udph.source = htons(toi_cluster_port_send); ++ b->udph.dest = htons(toi_cluster_port_recv); ++ b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); ++ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ ++ ++ /* Construct message */ ++ b->message = message; ++ b->sid = my_id; ++ b->htype = net_dev->type; /* can cause undefined behavior */ ++ b->hlen = net_dev->addr_len; ++ memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); ++ b->secs = htons(3); /* 3 seconds */ ++ ++ /* Chain packet down the line... */ ++ skb->dev = net_dev; ++ skb->protocol = htons(ETH_P_IP); ++ if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), ++ net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || ++ dev_queue_xmit(skb) < 0) ++ printk(KERN_INFO "E"); ++} ++ ++/* ========================================= */ ++ ++/* kTOICluster */ ++ ++static atomic_t num_cluster_threads; ++static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); ++ ++static int kTOICluster(void *data) ++{ ++ unsigned long my_id; ++ ++ my_id = atomic_add_return(1, &num_cluster_threads) - 1; ++ node_array[my_id].current_message = (unsigned long) data; ++ ++ PRINTK("kTOICluster daemon %lu starting.\n", my_id); ++ ++ current->flags |= PF_NOFREEZE; ++ ++ while (node_array[my_id].current_message) { ++ toi_send_if(node_array[my_id].current_message, my_id); ++ sleep_on_timeout(&clusterd_events, ++ cluster_message_timeout); ++ PRINTK("Link state %lu is %d.\n", my_id, ++ node_array[my_id].current_message); ++ } ++ ++ toi_send_if(MSG_BYE, my_id); ++ atomic_dec(&num_cluster_threads); ++ wake_up(&clusterd_events); ++ ++ PRINTK("kTOICluster daemon %lu exiting.\n", my_id); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++} ++ ++static void kill_clusterd(void) ++{ ++ int i; ++ ++ for (i = 0; i < num_local_nodes; i++) { ++ if (node_array[i].current_message) { ++ PRINTK("Seeking to kill clusterd %d.\n", i); ++ node_array[i].current_message = 0; ++ } ++ } ++ wait_event(clusterd_events, ++ !atomic_read(&num_cluster_threads)); ++ PRINTK("All cluster daemons have exited.\n"); ++} ++ ++static int peers_not_in_message(int index, int message, int precise) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ int result = 0; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ list_for_each_entry(this, &node_array[index].member_list, list) { ++ if (this->ignore) ++ continue; ++ ++ PRINTK("Peer %d.%d.%d.%d sending %s. " ++ "Seeking %s.\n", ++ NIPQUAD(this->addr), ++ str_message(this->message), str_message(message)); ++ if ((precise ? 
this->message : ++ this->message & MSG_STATE_MASK) != ++ message) ++ result++; ++ } ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++ PRINTK("%d peers in sought message.\n", result); ++ return result; ++} ++ ++static void reset_ignored(int index) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ list_for_each_entry(this, &node_array[index].member_list, list) ++ this->ignore = 0; ++ node_array[index].ignored_peer_count = 0; ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++} ++ ++static int peers_in_message(int index, int message, int precise) ++{ ++ return node_array[index].peer_count - ++ node_array[index].ignored_peer_count - ++ peers_not_in_message(index, message, precise); ++} ++ ++static int time_to_continue(int index, unsigned long start, int message) ++{ ++ int first = peers_not_in_message(index, message, 0); ++ int second = peers_in_message(index, message, 1); ++ ++ PRINTK("First part returns %d, second returns %d.\n", first, second); ++ ++ if (!first && !second) { ++ PRINTK("All peers answered message %d.\n", ++ message); ++ return 1; ++ } ++ ++ if (time_after(jiffies, start + continue_delay)) { ++ PRINTK("Timeout reached.\n"); ++ return 1; ++ } ++ ++ PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, ++ start + continue_delay); ++ return 0; ++} ++ ++void toi_initiate_cluster_hibernate(void) ++{ ++ int result; ++ unsigned long start; ++ ++ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); ++ if (result) ++ return; ++ ++ toi_send_if(MSG_HIBERNATE, 0); ++ ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_HIBERNATE)); ++ ++ if (test_action_state(TOI_FREEZER_TEST)) { ++ toi_send_if(MSG_ABORT, 0); ++ ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_RUNNING)); ++ ++ do_toi_step(STEP_QUIET_CLEANUP); ++ return; ++ } ++ ++ toi_send_if(MSG_IO, 0); ++ ++ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); ++ if (result) ++ return; ++ ++ /* This code runs at resume time too! */ ++ if (toi_in_hibernate) ++ result = do_toi_step(STEP_HIBERNATE_POWERDOWN); ++} ++EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate); ++ ++/* toi_cluster_print_debug_stats ++ * ++ * Description: Print information to be recorded for debugging purposes into a ++ * buffer. ++ * Arguments: buffer: Pointer to a buffer into which the debug info will be ++ * printed. ++ * size: Size of the buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++static int toi_cluster_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ if (strlen(toi_cluster_iface)) ++ len = scnprintf(buffer, size, ++ "- Cluster interface is '%s'.\n", ++ toi_cluster_iface); ++ else ++ len = scnprintf(buffer, size, ++ "- Cluster support is disabled.\n"); ++ return len; ++} ++ ++/* cluster_memory_needed ++ * ++ * Description: Tell the caller how much memory we need to operate during ++ * hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_cluster_memory_needed(void) ++{ ++ return 0; ++} ++ ++static int toi_cluster_storage_needed(void) ++{ ++ return 1 + strlen(toi_cluster_iface); ++} ++ ++/* toi_cluster_save_config_info ++ * ++ * Description: Save informaton needed when reloading the image at resume time. ++ * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. ++ * Returns: Number of bytes used for saving our data. 
++ */ ++static int toi_cluster_save_config_info(char *buffer) ++{ ++ strcpy(buffer, toi_cluster_iface); ++ return strlen(toi_cluster_iface + 1); ++} ++ ++/* toi_cluster_load_config_info ++ * ++ * Description: Reload information needed for declustering the image at ++ * resume time. ++ * Arguments: Buffer: Pointer to the start of the data. ++ * Size: Number of bytes that were saved. ++ */ ++static void toi_cluster_load_config_info(char *buffer, int size) ++{ ++ strncpy(toi_cluster_iface, buffer, size); ++ return; ++} ++ ++static void cluster_startup(void) ++{ ++ int have_image = do_check_can_resume(), i; ++ unsigned long start = jiffies, initial_message; ++ struct task_struct *p; ++ ++ initial_message = MSG_IMAGE; ++ ++ have_image = 1; ++ ++ for (i = 0; i < num_local_nodes; i++) { ++ PRINTK("Starting ktoiclusterd %d.\n", i); ++ p = kthread_create(kTOICluster, (void *) initial_message, ++ "ktoiclusterd/%d", i); ++ if (IS_ERR(p)) { ++ printk(KERN_ERR "Failed to start ktoiclusterd.\n"); ++ return; ++ } ++ ++ wake_up_process(p); ++ } ++ ++ /* Wait for delay or someone else sending first message */ ++ wait_event(node_array[0].member_events, time_to_continue(0, start, ++ MSG_IMAGE)); ++ ++ others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); ++ ++ printk(KERN_INFO "Continuing. I %shave an image. Peers with image:" ++ " %d.\n", have_image ? "" : "don't ", others_have_image); ++ ++ if (have_image) { ++ int result; ++ ++ /* Start to resume */ ++ printk(KERN_INFO " === Starting to resume === \n"); ++ node_array[0].current_message = MSG_IO; ++ toi_send_if(MSG_IO, 0); ++ ++ /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */ ++ result = 0; ++ ++ if (!result) { ++ /* ++ * Atomic restore - we'll come back in the hibernation ++ * path. ++ */ ++ ++ /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */ ++ result = 0; ++ ++ /* do_toi_step(STEP_QUIET_CLEANUP); */ ++ } ++ ++ node_array[0].current_message |= MSG_NACK; ++ ++ /* For debugging - disable for real life? */ ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_IO)); ++ } ++ ++ if (others_have_image) { ++ /* Wait for them to resume */ ++ printk(KERN_INFO "Waiting for other nodes to resume.\n"); ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_RUNNING)); ++ if (peers_not_in_message(0, MSG_RUNNING, 0)) ++ printk(KERN_INFO "Timed out while waiting for other " ++ "nodes to resume.\n"); ++ } ++ ++ /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE ++ * as appropriate. ++ * ++ * If we don't have an image: ++ * - Wait until someone else says they have one, or conditions are met ++ * for continuing to boot (n machines or t seconds). ++ * - If anyone has an image, wait for them to resume before continuing ++ * to boot. ++ * ++ * If we have an image: ++ * - Wait until conditions are met before continuing to resume (n ++ * machines or t seconds). Send RESUME_PREP and freeze processes. ++ * NACK_PREP if freezing fails (shouldn't) and follow logic for ++ * us having no image above. On success, wait for [N]ACK_PREP from ++ * other machines. Read image (including atomic restore) until done. ++ * Wait for ACK_READ from others (should never fail). Thaw processes ++ * and do post-resume. (The section after the atomic restore is done ++ * via the code for hibernating). ++ */ ++ ++ node_array[0].current_message = MSG_RUNNING; ++} ++ ++/* toi_cluster_open_iface ++ * ++ * Description: Prepare to use an interface. 
++ */ ++ ++static int toi_cluster_open_iface(void) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ ++ for_each_netdev(&init_net, dev) { ++ if (/* dev == &init_net.loopback_dev || */ ++ strcmp(dev->name, toi_cluster_iface)) ++ continue; ++ ++ net_dev = dev; ++ break; ++ } ++ ++ rtnl_unlock(); ++ ++ if (!net_dev) { ++ printk(KERN_ERR MYNAME ": Device %s not found.\n", ++ toi_cluster_iface); ++ return -ENODEV; ++ } ++ ++ dev_add_pack(&toi_cluster_packet_type); ++ added_pack = 1; ++ ++ loopback_mode = (net_dev == init_net.loopback_dev); ++ num_local_nodes = loopback_mode ? 8 : 1; ++ ++ PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", ++ loopback_mode ? "on" : "off", num_local_nodes); ++ ++ cluster_startup(); ++ return 0; ++} ++ ++/* toi_cluster_close_iface ++ * ++ * Description: Stop using an interface. ++ */ ++ ++static int toi_cluster_close_iface(void) ++{ ++ kill_clusterd(); ++ if (added_pack) { ++ dev_remove_pack(&toi_cluster_packet_type); ++ added_pack = 0; ++ } ++ return 0; ++} ++ ++static void write_side_effect(void) ++{ ++ if (toi_cluster_ops.enabled) { ++ toi_cluster_open_iface(); ++ set_toi_state(TOI_CLUSTER_MODE); ++ } else { ++ toi_cluster_close_iface(); ++ clear_toi_state(TOI_CLUSTER_MODE); ++ } ++} ++ ++static void node_write_side_effect(void) ++{ ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, ++ NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, ++ write_side_effect), ++ SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), ++ SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, ++ 256, 0, NULL), ++ SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, ++ 256, 0, STRING), ++ SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, ++ 0) ++}; ++ ++/* ++ * Ops structure. 
++ */ ++ ++static struct toi_module_ops toi_cluster_ops = { ++ .type = FILTER_MODULE, ++ .name = "Cluster", ++ .directory = "cluster", ++ .module = THIS_MODULE, ++ .memory_needed = toi_cluster_memory_needed, ++ .print_debug_info = toi_cluster_print_debug_stats, ++ .save_config_info = toi_cluster_save_config_info, ++ .load_config_info = toi_cluster_load_config_info, ++ .storage_needed = toi_cluster_storage_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++#ifdef MODULE ++#define INIT static __init ++#define EXIT static __exit ++#else ++#define INIT ++#define EXIT ++#endif ++ ++INIT int toi_cluster_init(void) ++{ ++ int temp = toi_register_module(&toi_cluster_ops), i; ++ struct kobject *kobj = toi_cluster_ops.dir_kobj; ++ ++ for (i = 0; i < MAX_LOCAL_NODES; i++) { ++ node_array[i].current_message = 0; ++ INIT_LIST_HEAD(&node_array[i].member_list); ++ init_waitqueue_head(&node_array[i].member_events); ++ spin_lock_init(&node_array[i].member_list_lock); ++ spin_lock_init(&node_array[i].receive_lock); ++ ++ /* Set up sysfs entry */ ++ node_array[i].sysfs_data.attr.name = toi_kzalloc(8, ++ sizeof(node_array[i].sysfs_data.attr.name), ++ GFP_KERNEL); ++ sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", ++ i); ++ node_array[i].sysfs_data.attr.mode = SYSFS_RW; ++ node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; ++ node_array[i].sysfs_data.flags = 0; ++ node_array[i].sysfs_data.data.integer.variable = ++ (int *) &node_array[i].current_message; ++ node_array[i].sysfs_data.data.integer.minimum = 0; ++ node_array[i].sysfs_data.data.integer.maximum = INT_MAX; ++ node_array[i].sysfs_data.write_side_effect = ++ node_write_side_effect; ++ toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); ++ } ++ ++ toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); ++ ++ if (toi_cluster_ops.enabled) ++ toi_cluster_open_iface(); ++ ++ return temp; ++} ++ ++EXIT void toi_cluster_exit(void) ++{ ++ int i; ++ toi_cluster_close_iface(); ++ ++ for (i = 0; i < MAX_LOCAL_NODES; i++) ++ toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, ++ &node_array[i].sysfs_data); ++ toi_unregister_module(&toi_cluster_ops); ++} ++ ++static int __init toi_cluster_iface_setup(char *iface) ++{ ++ toi_cluster_ops.enabled = (*iface && ++ strcmp(iface, "off")); ++ ++ if (toi_cluster_ops.enabled) ++ strncpy(toi_cluster_iface, iface, strlen(iface)); ++} ++ ++__setup("toi_cluster=", toi_cluster_iface_setup); ++ ++#ifdef MODULE ++MODULE_LICENSE("GPL"); ++module_init(toi_cluster_init); ++module_exit(toi_cluster_exit); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Cluster Support for TuxOnIce"); ++#endif +diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h +new file mode 100644 +index 0000000..051feb3 +--- /dev/null ++++ b/kernel/power/tuxonice_cluster.h +@@ -0,0 +1,18 @@ ++/* ++ * kernel/power/tuxonice_cluster.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifdef CONFIG_TOI_CLUSTER ++extern int toi_cluster_init(void); ++extern void toi_cluster_exit(void); ++extern void toi_initiate_cluster_hibernate(void); ++#else ++static inline int toi_cluster_init(void) { return 0; } ++static inline void toi_cluster_exit(void) { } ++static inline void toi_initiate_cluster_hibernate(void) { } ++#endif ++ +diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c +new file mode 100644 +index 0000000..2d89c4c +--- /dev/null ++++ b/kernel/power/tuxonice_compress.c +@@ -0,0 +1,465 @@ ++/* ++ * kernel/power/compression.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data compression routines for TuxOnIce, ++ * using cryptoapi. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++static int toi_expected_compression; ++ ++static struct toi_module_ops toi_compression_ops; ++static struct toi_module_ops *next_driver; ++ ++static char toi_compressor_name[32] = "lzo"; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ u8 *page_buffer; ++ struct crypto_comp *transform; ++ unsigned int len; ++ u8 *buffer_start; ++ u8 *output_buffer; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. ++ */ ++static int toi_compress_crypto_prepare(void) ++{ ++ int cpu; ++ ++ if (!*toi_compressor_name) { ++ printk(KERN_INFO "TuxOnIce: Compression enabled but no " ++ "compressor name set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); ++ if (IS_ERR(this->transform)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s compression transform.\n", ++ toi_compressor_name); ++ this->transform = NULL; ++ return 1; ++ } ++ ++ this->page_buffer = ++ (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); ++ ++ if (!this->page_buffer) { ++ printk(KERN_ERR ++ "Failed to allocate a page buffer for TuxOnIce " ++ "compression driver.\n"); ++ return -ENOMEM; ++ } ++ ++ this->output_buffer = ++ (char *) vmalloc_32(OUT_BUF_SIZE); ++ ++ if (!this->output_buffer) { ++ printk(KERN_ERR ++ "Failed to allocate a output buffer for TuxOnIce " ++ "compression driver.\n"); ++ return -ENOMEM; ++ } ++ } ++ ++ return 0; ++} ++ ++static int toi_compress_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->transform) { ++ crypto_free_comp(this->transform); ++ this->transform = NULL; ++ } ++ ++ if (this->page_buffer) ++ toi_free_page(16, (unsigned long) this->page_buffer); ++ ++ this->page_buffer = NULL; ++ ++ if (this->output_buffer) ++ vfree(this->output_buffer); ++ ++ this->output_buffer = NULL; ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_compress_init ++ */ ++ ++static int toi_compress_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ toi_compress_bytes_in = 0; ++ toi_compress_bytes_out = 0; ++ ++ next_driver = toi_get_next_filter(&toi_compression_ops); ++ ++ return next_driver ? 
0 : -ECHILD; ++} ++ ++/* ++ * toi_compress_rw_init() ++ */ ++ ++static int toi_compress_rw_init(int rw, int stream_number) ++{ ++ if (toi_compress_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise compression " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ "compressing the image.\n"); ++ toi_compression_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_compress_write_page() ++ * ++ * Compress a page of data, buffering output and passing on filled ++ * pages to the next module in the pipeline. ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be compressed. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, -ECHILD if we have a broken pipeline or -EIO if ++ * zlib errs. ++ */ ++static int toi_compress_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ u8* output_buffer = buffer_page; ++ int output_len = buf_size; ++ int out_buf_type = buf_type; ++ ++ if (ctx->transform) { ++ ++ ctx->buffer_start = TOI_MAP(buf_type, buffer_page); ++ ctx->len = OUT_BUF_SIZE; ++ ++ ret = crypto_comp_compress(ctx->transform, ++ ctx->buffer_start, buf_size, ++ ctx->output_buffer, &ctx->len); ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ ++ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, ++ "CPU %d, index %lu: %d bytes", ++ cpu, index, ctx->len); ++ ++ if (!ret && ctx->len < buf_size) { /* some compression */ ++ output_buffer = ctx->output_buffer; ++ output_len = ctx->len; ++ out_buf_type = TOI_VIRT; ++ } ++ ++ } ++ ++ mutex_lock(&stats_lock); ++ ++ toi_compress_bytes_in += buf_size; ++ toi_compress_bytes_out += output_len; ++ ++ mutex_unlock(&stats_lock); ++ ++ if (!ret) ++ ret = next_driver->write_page(index, out_buf_type, ++ output_buffer, output_len); ++ ++ return ret; ++} ++ ++/* ++ * toi_compress_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Retrieve data from later modules and decompress it until the input buffer ++ * is filled. ++ * Zero if successful. Error condition from me or from downstream on failure. ++ */ ++static int toi_compress_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int ret, cpu = smp_processor_id(); ++ unsigned int len; ++ unsigned int outlen = PAGE_SIZE; ++ char *buffer_start; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!ctx->transform) ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, ++ buf_size); ++ ++ /* ++ * All our reads must be synchronous - we can't decompress ++ * data that hasn't been read yet. 
++ */ ++ ++ ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); ++ ++ buffer_start = kmap(buffer_page); ++ ++ /* Error or uncompressed data */ ++ if (ret || len == PAGE_SIZE) { ++ memcpy(buffer_start, ctx->page_buffer, len); ++ goto out; ++ } ++ ++ ret = crypto_comp_decompress( ++ ctx->transform, ++ ctx->page_buffer, ++ len, buffer_start, &outlen); ++ ++ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, ++ "CPU %d, index %lu: %d=>%d (%d).", ++ cpu, *index, len, outlen, ret); ++ ++ if (ret) ++ abort_hibernate(TOI_FAILED_IO, ++ "Compress_read returned %d.\n", ret); ++ else if (outlen != PAGE_SIZE) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Decompression yielded %d bytes instead of %ld.\n", ++ outlen, PAGE_SIZE); ++ printk(KERN_ERR "Decompression yielded %d bytes instead of " ++ "%ld.\n", outlen, PAGE_SIZE); ++ ret = -EIO; ++ *buf_size = outlen; ++ } ++out: ++ TOI_UNMAP(buf_type, buffer_page); ++ return ret; ++} ++ ++/* ++ * toi_compress_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_compress_print_debug_stats(char *buffer, int size) ++{ ++ unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, ++ pages_out = toi_compress_bytes_out >> PAGE_SHIFT; ++ int len; ++ ++ /* Output the compression ratio achieved. */ ++ if (*toi_compressor_name) ++ len = scnprintf(buffer, size, "- Compressor is '%s'.\n", ++ toi_compressor_name); ++ else ++ len = scnprintf(buffer, size, "- Compressor is not set.\n"); ++ ++ if (pages_in) ++ len += scnprintf(buffer+len, size - len, " Compressed " ++ "%lu bytes into %lu (%ld percent compression).\n", ++ toi_compress_bytes_in, ++ toi_compress_bytes_out, ++ (pages_in - pages_out) * 100 / pages_in); ++ return len; ++} ++ ++/* ++ * toi_compress_compression_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_compress_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_compress_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + 2 * sizeof(int) + ++ strlen(toi_compressor_name) + 1; ++} ++ ++/* ++ * toi_compress_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. ++ */ ++static int toi_compress_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_compressor_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_compress_bytes_in; ++ offset += sizeof(unsigned long); ++ *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = toi_expected_compression; ++ offset += sizeof(int); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_compressor_name, len); ++ return offset + len; ++} ++ ++/* toi_compress_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for decompressing the image at ++ * resume time. 
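++ * The buffer layout mirrors toi_compress_save_config_info() above: two
++ * unsigned longs (bytes in/out), an int (expected compression), an int
++ * (name length), then the NUL-terminated compressor name.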
++ */ ++static void toi_compress_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_compress_bytes_in = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); ++ offset += sizeof(unsigned long); ++ toi_expected_compression = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_compressor_name, buffer + offset, len); ++} ++ ++static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->compress_bytes_in = toi_compress_bytes_in; ++ bkd->compress_bytes_out = toi_compress_bytes_out; ++} ++ ++static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_compress_bytes_in = bkd->compress_bytes_in; ++ toi_compress_bytes_out = bkd->compress_bytes_out; ++} ++ ++/* ++ * toi_expected_compression_ratio ++ * ++ * Description: Returns the expected ratio between data passed into this module ++ * and the amount of data output when writing. ++ * Returns: 100 if the module is disabled. Otherwise the value set by the ++ * user via our sysfs entry. ++ */ ++ ++static int toi_compress_expected_ratio(void) ++{ ++ if (!toi_compression_ops.enabled) ++ return 100; ++ else ++ return 100 - toi_expected_compression; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, ++ 0, 99, 0, NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_compression_ops = { ++ .type = FILTER_MODULE, ++ .name = "compression", ++ .directory = "compression", ++ .module = THIS_MODULE, ++ .initialise = toi_compress_init, ++ .memory_needed = toi_compress_memory_needed, ++ .print_debug_info = toi_compress_print_debug_stats, ++ .save_config_info = toi_compress_save_config_info, ++ .load_config_info = toi_compress_load_config_info, ++ .storage_needed = toi_compress_storage_needed, ++ .expected_compression = toi_compress_expected_ratio, ++ ++ .pre_atomic_restore = toi_compress_pre_atomic_restore, ++ .post_atomic_restore = toi_compress_post_atomic_restore, ++ ++ .rw_init = toi_compress_rw_init, ++ .rw_cleanup = toi_compress_rw_cleanup, ++ ++ .write_page = toi_compress_write_page, ++ .read_page = toi_compress_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_compress_load(void) ++{ ++ return toi_register_module(&toi_compression_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_compress_unload(void) ++{ ++ toi_unregister_module(&toi_compression_ops); ++} ++ ++module_init(toi_compress_load); ++module_exit(toi_compress_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Compression Support for TuxOnIce"); ++#else ++late_initcall(toi_compress_load); ++#endif +diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c +new file mode 100644 +index 0000000..e84572c +--- /dev/null ++++ b/kernel/power/tuxonice_extent.c +@@ -0,0 +1,123 @@ ++/* ++ * kernel/power/tuxonice_extent.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. 
++ * ++ * These functions encapsulate the manipulation of storage metadata. ++ */ ++ ++#include ++#include "tuxonice_modules.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++#include "tuxonice.h" ++ ++/** ++ * toi_get_extent - return a free extent ++ * ++ * May fail, returning NULL instead. ++ **/ ++static struct hibernate_extent *toi_get_extent(void) ++{ ++ return (struct hibernate_extent *) toi_kzalloc(2, ++ sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); ++} ++ ++/** ++ * toi_put_extent_chain - free a whole chain of extents ++ * @chain: Chain to free. ++ **/ ++void toi_put_extent_chain(struct hibernate_extent_chain *chain) ++{ ++ struct hibernate_extent *this; ++ ++ this = chain->first; ++ ++ while (this) { ++ struct hibernate_extent *next = this->next; ++ toi_kfree(2, this, sizeof(*this)); ++ chain->num_extents--; ++ this = next; ++ } ++ ++ chain->first = NULL; ++ chain->last_touched = NULL; ++ chain->current_extent = NULL; ++ chain->size = 0; ++} ++EXPORT_SYMBOL_GPL(toi_put_extent_chain); ++ ++/** ++ * toi_add_to_extent_chain - add an extent to an existing chain ++ * @chain: Chain to which the extend should be added ++ * @start: Start of the extent (first physical block) ++ * @end: End of the extent (last physical block) ++ * ++ * The chain information is updated if the insertion is successful. ++ **/ ++int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, ++ unsigned long start, unsigned long end) ++{ ++ struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Adding extent %lu-%lu to chain %p.\n", start, end, chain); ++ ++ /* Find the right place in the chain */ ++ if (chain->last_touched && chain->last_touched->start < start) ++ cur_ext = chain->last_touched; ++ else if (chain->first && chain->first->start < start) ++ cur_ext = chain->first; ++ ++ if (cur_ext) { ++ while (cur_ext->next && cur_ext->next->start < start) ++ cur_ext = cur_ext->next; ++ ++ if (cur_ext->end == (start - 1)) { ++ struct hibernate_extent *next_ext = cur_ext->next; ++ cur_ext->end = end; ++ ++ /* Merge with the following one? */ ++ if (next_ext && cur_ext->end + 1 == next_ext->start) { ++ cur_ext->end = next_ext->end; ++ cur_ext->next = next_ext->next; ++ toi_kfree(2, next_ext, sizeof(*next_ext)); ++ chain->num_extents--; ++ } ++ ++ chain->last_touched = cur_ext; ++ chain->size += (end - start + 1); ++ ++ return 0; ++ } ++ } ++ ++ new_ext = toi_get_extent(); ++ if (!new_ext) { ++ printk(KERN_INFO "Error unable to append a new extent to the " ++ "chain.\n"); ++ return -ENOMEM; ++ } ++ ++ chain->num_extents++; ++ chain->size += (end - start + 1); ++ new_ext->start = start; ++ new_ext->end = end; ++ ++ chain->last_touched = new_ext; ++ ++ if (cur_ext) { ++ new_ext->next = cur_ext->next; ++ cur_ext->next = new_ext; ++ } else { ++ if (chain->first) ++ new_ext->next = chain->first; ++ chain->first = new_ext; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_add_to_extent_chain); +diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h +new file mode 100644 +index 0000000..157446cf +--- /dev/null ++++ b/kernel/power/tuxonice_extent.h +@@ -0,0 +1,44 @@ ++/* ++ * kernel/power/tuxonice_extent.h ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations related to extents. Extents are ++ * TuxOnIce's method of storing some of the metadata for the image. ++ * See tuxonice_extent.c for more info. 
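++ *
++ * Illustrative sketch of walking a chain with toi_extent_for_each()
++ * (walk_chain() and handle_block() are hypothetical helpers, not part of
++ * this patch):
++ *
++ *	static void walk_chain(struct hibernate_extent_chain *chain)
++ *	{
++ *		struct hibernate_extent *ext;
++ *		unsigned long block;
++ *
++ *		toi_extent_for_each(chain, ext, block)
++ *			handle_block(block);
++ *	}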
++ * ++ */ ++ ++#include "tuxonice_modules.h" ++ ++#ifndef EXTENT_H ++#define EXTENT_H ++ ++struct hibernate_extent { ++ unsigned long start, end; ++ struct hibernate_extent *next; ++}; ++ ++struct hibernate_extent_chain { ++ unsigned long size; /* size of the chain ie sum (max-min+1) */ ++ int num_extents; ++ struct hibernate_extent *first, *last_touched; ++ struct hibernate_extent *current_extent; ++ unsigned long current_offset; ++}; ++ ++/* Simplify iterating through all the values in an extent chain */ ++#define toi_extent_for_each(extent_chain, extentpointer, value) \ ++if ((extent_chain)->first) \ ++ for ((extentpointer) = (extent_chain)->first, (value) = \ ++ (extentpointer)->start; \ ++ ((extentpointer) && ((extentpointer)->next || (value) <= \ ++ (extentpointer)->end)); \ ++ (((value) == (extentpointer)->end) ? \ ++ ((extentpointer) = (extentpointer)->next, (value) = \ ++ ((extentpointer) ? (extentpointer)->start : 0)) : \ ++ (value)++)) ++ ++#endif +diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c +new file mode 100644 +index 0000000..4b817c4 +--- /dev/null ++++ b/kernel/power/tuxonice_file.c +@@ -0,0 +1,497 @@ ++/* ++ * kernel/power/tuxonice_file.c ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file encapsulates functions for usage of a simple file as a ++ * backing store. It is based upon the swapallocator, and shares the ++ * same basic working. Here, though, we have nothing to do with ++ * swapspace, and only one device to worry about. ++ * ++ * The user can just ++ * ++ * echo TuxOnIce > /path/to/my_file ++ * ++ * dd if=/dev/zero bs=1M count= >> /path/to/my_file ++ * ++ * and ++ * ++ * echo /path/to/my_file > /sys/power/tuxonice/file/target ++ * ++ * then put what they find in /sys/power/tuxonice/resume ++ * as their resume= parameter in lilo.conf (and rerun lilo if using it). ++ * ++ * Having done this, they're ready to hibernate and resume. ++ * ++ * TODO: ++ * - File resizing. 
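++ *
++ * Note: writing the target path to /sys/power/tuxonice/file/target (the
++ * sysfs entry registered below) runs test_toi_file_target(), which probes
++ * the storage and, when it is usable, composes a suitable resume= string
++ * in resume_file (exposed as /sys/power/tuxonice/resume).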
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_io.h" ++ ++#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) ++ ++static struct toi_module_ops toi_fileops; ++ ++static struct file *target_file; ++static struct block_device *toi_file_target_bdev; ++static unsigned long pages_available, pages_allocated; ++static char toi_file_target[256]; ++static struct inode *target_inode; ++static int file_target_priority; ++static int used_devt; ++static int target_claim; ++static dev_t toi_file_dev_t; ++static int sig_page_index; ++ ++/* For test_toi_file_target */ ++static struct toi_bdev_info *file_chain; ++ ++static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) ++{ ++ int j; ++ sector_t last = 0; ++ ++ for (j = 0; j < dev_info->blocks_per_page; j++) { ++ sector_t this = bmap(target_inode, ++ page_num * dev_info->blocks_per_page + j); ++ ++ if (!this || (last && (last + 1) != this)) ++ break; ++ ++ last = this; ++ } ++ ++ return j == dev_info->blocks_per_page; ++} ++ ++static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) ++{ ++ unsigned long result = 0; ++ struct block_device *bdev = dev_info->bdev; ++ int i; ++ ++ switch (target_inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFCHR: ++ case S_IFIFO: /* Socket, Char, Fifo */ ++ return -1; ++ case S_IFREG: /* Regular file: current size - holes + free ++ space on part */ ++ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { ++ if (has_contiguous_blocks(dev_info, i)) ++ result++; ++ } ++ break; ++ case S_IFBLK: /* Block device */ ++ if (!bdev->bd_disk) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "bdev->bd_disk null."); ++ return 0; ++ } ++ ++ result = (bdev->bd_part ? ++ bdev->bd_part->nr_sects : ++ get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); ++ } ++ ++ ++ return result; ++} ++ ++static int toi_file_register_storage(void) ++{ ++ struct toi_bdev_info *devinfo; ++ int result = 0; ++ struct fs_info *fs_info; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); ++ if (!strlen(toi_file_target)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " ++ "No target filename set."); ++ return 0; ++ } ++ ++ target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", ++ toi_file_target, target_file); ++ ++ if (IS_ERR(target_file) || !target_file) { ++ target_file = NULL; ++ toi_file_dev_t = name_to_dev_t(toi_file_target); ++ if (!toi_file_dev_t) { ++ struct kstat stat; ++ int error = vfs_stat(toi_file_target, &stat); ++ printk(KERN_INFO "Open file %s returned %p and " ++ "name_to_devt failed.\n", ++ toi_file_target, target_file); ++ if (error) { ++ printk(KERN_INFO "Stating the file also failed." 
++ " Nothing more we can do.\n"); ++ return 0; ++ } else ++ toi_file_dev_t = stat.rdev; ++ } ++ ++ toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); ++ if (IS_ERR(toi_file_target_bdev)) { ++ printk(KERN_INFO "Got a dev_num (%lx) but failed to " ++ "open it.\n", ++ (unsigned long) toi_file_dev_t); ++ toi_file_target_bdev = NULL; ++ return 0; ++ } ++ used_devt = 1; ++ target_inode = toi_file_target_bdev->bd_inode; ++ } else ++ target_inode = target_file->f_mapping->host; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); ++ if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || ++ S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { ++ printk(KERN_INFO "File support works with regular files," ++ " character files and block devices.\n"); ++ /* Cleanup routine will undo the above */ ++ return 0; ++ } ++ ++ if (!used_devt) { ++ if (S_ISBLK(target_inode->i_mode)) { ++ toi_file_target_bdev = I_BDEV(target_inode); ++ if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | ++ FMODE_READ, NULL)) ++ target_claim = 1; ++ } else ++ toi_file_target_bdev = target_inode->i_sb->s_bdev; ++ if (!toi_file_target_bdev) { ++ printk(KERN_INFO "%s is not a valid file allocator " ++ "target.\n", toi_file_target); ++ return 0; ++ } ++ toi_file_dev_t = toi_file_target_bdev->bd_dev; ++ } ++ ++ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); ++ if (!devinfo) { ++ printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); ++ return -ENOMEM; ++ } ++ ++ devinfo->bdev = toi_file_target_bdev; ++ devinfo->allocator = &toi_fileops; ++ devinfo->allocator_index = 0; ++ ++ fs_info = fs_info_from_block_dev(toi_file_target_bdev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(devinfo->uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ /* Unlike swap code, only complain if fs_info_from_block_dev returned ++ * -ENOMEM. The 'file' might be a full partition, so might validly not ++ * have an identifiable type, UUID etc. ++ */ ++ if (result) ++ printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", ++ result); ++ devinfo->dev_t = toi_file_dev_t; ++ devinfo->prio = file_target_priority; ++ devinfo->bmap_shift = target_inode->i_blkbits - 9; ++ devinfo->blocks_per_page = ++ (1 << (PAGE_SHIFT - target_inode->i_blkbits)); ++ sprintf(devinfo->name, "file %s", toi_file_target); ++ file_chain = devinfo; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " ++ "shift is %d. Blocks per page %d.", ++ devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, ++ devinfo->blocks_per_page); ++ ++ /* Keep one aside for the signature */ ++ pages_available = get_usable_pages(devinfo) - 1; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " ++ "pages.", pages_available); ++ ++ toi_bio_ops.register_storage(devinfo); ++ return 0; ++} ++ ++static unsigned long toi_file_storage_available(void) ++{ ++ return pages_available; ++} ++ ++static int toi_file_allocate_storage(struct toi_bdev_info *chain, ++ unsigned long request) ++{ ++ unsigned long available = pages_available - pages_allocated; ++ unsigned long to_add = min(available, request); ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " ++ "is %lu. 
Allocating %lu pages from file.", ++ pages_available, pages_allocated, to_add); ++ pages_allocated += to_add; ++ ++ return to_add; ++} ++ ++/** ++ * __populate_block_list - add an extent to the chain ++ * @min: Start of the extent (first physical block = sector) ++ * @max: End of the extent (last physical block = sector) ++ * ++ * If TOI_TEST_BIO is set, print a debug message, outputting the min and max ++ * fs block numbers. ++ **/ ++static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) ++{ ++ if (test_action_state(TOI_TEST_BIO)) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", ++ min << chain->bmap_shift, ++ ((max + 1) << chain->bmap_shift) - 1); ++ ++ return toi_add_to_extent_chain(&chain->blocks, min, max); ++} ++ ++static int get_main_pool_phys_params(struct toi_bdev_info *chain) ++{ ++ int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; ++ unsigned long pages_mapped = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); ++ ++ if (chain->blocks.first) ++ toi_put_extent_chain(&chain->blocks); ++ ++ if (!target_is_normal_file()) { ++ result = (pages_available > 0) ? ++ __populate_block_list(chain, chain->blocks_per_page, ++ (pages_allocated + 1) * ++ chain->blocks_per_page - 1) : 0; ++ return result; ++ } ++ ++ /* ++ * FIXME: We are assuming the first page is contiguous. Is that ++ * assumption always right? ++ */ ++ ++ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { ++ sector_t new_sector; ++ ++ if (!has_contiguous_blocks(chain, i)) ++ continue; ++ ++ if (!have_sig_page) { ++ have_sig_page = 1; ++ sig_page_index = i; ++ continue; ++ } ++ ++ pages_mapped++; ++ ++ /* Ignore first page - it has the header */ ++ if (pages_mapped == 1) ++ continue; ++ ++ new_sector = bmap(target_inode, (i * chain->blocks_per_page)); ++ ++ /* ++ * I'd love to be able to fill in holes and resize ++ * files, but not yet... 
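++		 * Until then only fully-contiguous pages are used: each run
++		 * of contiguous pages is coalesced into a single extent via
++		 * __populate_block_list(), after the signature page
++		 * (sig_page_index) and the header page have been set aside.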
++		 */
++
++		if (new_sector == extent_max + 1)
++			extent_max += chain->blocks_per_page;
++		else {
++			if (extent_min > -1) {
++				result = __populate_block_list(chain,
++						extent_min, extent_max);
++				if (result)
++					return result;
++			}
++
++			extent_min = new_sector;
++			extent_max = extent_min +
++				chain->blocks_per_page - 1;
++		}
++
++		if (pages_mapped == pages_allocated)
++			break;
++	}
++
++	if (extent_min > -1) {
++		result = __populate_block_list(chain, extent_min, extent_max);
++		if (result)
++			return result;
++	}
++
++	return 0;
++}
++
++static void toi_file_free_storage(struct toi_bdev_info *chain)
++{
++	pages_allocated = 0;
++	file_chain = NULL;
++}
++
++/**
++ * toi_file_print_debug_stats - print debug info
++ * @buffer: Buffer to populate with data
++ * @size: Size of the buffer
++ **/
++static int toi_file_print_debug_stats(char *buffer, int size)
++{
++	int len = scnprintf(buffer, size, "- File Allocator active.\n");
++
++	len += scnprintf(buffer+len, size-len, " Storage available for "
++			"image: %lu pages.\n", pages_available);
++
++	return len;
++}
++
++static void toi_file_cleanup(int finishing_cycle)
++{
++	if (toi_file_target_bdev) {
++		if (target_claim) {
++			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
++			target_claim = 0;
++		}
++
++		if (used_devt) {
++			blkdev_put(toi_file_target_bdev,
++					FMODE_READ | FMODE_NDELAY);
++			used_devt = 0;
++		}
++		toi_file_target_bdev = NULL;
++		target_inode = NULL;
++	}
++
++	if (target_file) {
++		filp_close(target_file, NULL);
++		target_file = NULL;
++	}
++
++	pages_available = 0;
++}
++
++/**
++ * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
++ *
++ * Test whether the target file is valid for hibernating.
++ **/
++static void test_toi_file_target(void)
++{
++	int result = toi_file_register_storage();
++	sector_t sector;
++	char buf[50];
++	struct fs_info *fs_info;
++
++	if (result || !file_chain)
++		return;
++
++	/* This doesn't mean we're in business. Is any storage available?
*/ ++ if (!pages_available) ++ goto out; ++ ++ toi_file_allocate_storage(file_chain, 1); ++ result = get_main_pool_phys_params(file_chain); ++ if (result) ++ goto out; ++ ++ ++ sector = bmap(target_inode, sig_page_index * ++ file_chain->blocks_per_page) << file_chain->bmap_shift; ++ ++ /* Use the uuid, or the dev_t if that fails */ ++ fs_info = fs_info_from_block_dev(toi_file_target_bdev); ++ if (!fs_info || IS_ERR(fs_info)) { ++ bdevname(toi_file_target_bdev, buf); ++ sprintf(resume_file, "/dev/%s:%llu", buf, ++ (unsigned long long) sector); ++ } else { ++ int i; ++ hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); ++ ++ /* Remove the spaces */ ++ for (i = 1; i < 16; i++) { ++ buf[2 * i] = buf[3 * i]; ++ buf[2 * i + 1] = buf[3 * i + 1]; ++ } ++ buf[32] = 0; ++ sprintf(resume_file, "UUID=%s:0x%llx", buf, ++ (unsigned long long) sector); ++ free_fs_info(fs_info); ++ } ++ ++ toi_attempt_to_parse_resume_device(0); ++out: ++ toi_file_free_storage(file_chain); ++ toi_bio_ops.free_storage(); ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, ++ SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), ++ SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, ++ 4096, 0, NULL), ++}; ++ ++static struct toi_bio_allocator_ops toi_bio_fileops = { ++ .register_storage = toi_file_register_storage, ++ .storage_available = toi_file_storage_available, ++ .allocate_storage = toi_file_allocate_storage, ++ .bmap = get_main_pool_phys_params, ++ .free_storage = toi_file_free_storage, ++}; ++ ++static struct toi_module_ops toi_fileops = { ++ .type = BIO_ALLOCATOR_MODULE, ++ .name = "file storage", ++ .directory = "file", ++ .module = THIS_MODULE, ++ .print_debug_info = toi_file_print_debug_stats, ++ .cleanup = toi_file_cleanup, ++ .bio_allocator_ops = &toi_bio_fileops, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++static __init int toi_file_load(void) ++{ ++ return toi_register_module(&toi_fileops); ++} ++ ++#ifdef MODULE ++static __exit void toi_file_unload(void) ++{ ++ toi_unregister_module(&toi_fileops); ++} ++ ++module_init(toi_file_load); ++module_exit(toi_file_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce FileAllocator"); ++#else ++late_initcall(toi_file_load); ++#endif +diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c +new file mode 100644 +index 0000000..4e8f4b6 +--- /dev/null ++++ b/kernel/power/tuxonice_highlevel.c +@@ -0,0 +1,1343 @@ ++/* ++ * kernel/power/tuxonice_highlevel.c ++ */ ++/** \mainpage TuxOnIce. ++ * ++ * TuxOnIce provides support for saving and restoring an image of ++ * system memory to an arbitrary storage device, either on the local computer, ++ * or across some network. The support is entirely OS based, so TuxOnIce ++ * works without requiring BIOS, APM or ACPI support. The vast majority of the ++ * code is also architecture independant, so it should be very easy to port ++ * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem ++ * and preemption. Initramfses and initrds are also supported. 
++ * ++ * TuxOnIce uses a modular design, in which the method of storing the image is ++ * completely abstracted from the core code, as are transformations on the data ++ * such as compression and/or encryption (multiple 'modules' can be used to ++ * provide arbitrary combinations of functionality). The user interface is also ++ * modular, so that arbitrarily simple or complex interfaces can be used to ++ * provide anything from debugging information through to eye candy. ++ * ++ * \section Copyright ++ * ++ * TuxOnIce is released under the GPLv2. ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti
++ * Copyright (C) 1998,2001,2002 Pavel Machek
++ * Copyright (C) 2002-2003 Florent Chabaud
++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net)
++ * ++ * \section Credits ++ * ++ * Nigel would like to thank the following people for their work: ++ * ++ * Bernard Blackham
++ * Web page & Wiki administration, some coding. A person without whom ++ * TuxOnIce would not be where it is. ++ * ++ * Michael Frank
++ * Extensive testing and help with improving stability. I was constantly ++ * amazed by the quality and quantity of Michael's help. ++ * ++ * Pavel Machek
++ * Modifications, defectiveness pointing, being with Gabor at the very ++ * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and ++ * 2.5.17. Even though Pavel and I disagree on the direction suspend to ++ * disk should take, I appreciate the valuable work he did in helping Gabor ++ * get the concept working. ++ * ++ * ..and of course the myriads of TuxOnIce users who have helped diagnose ++ * and fix bugs, made suggestions on how to improve the code, proofread ++ * documentation, and donated time and money. ++ * ++ * Thanks also to corporate sponsors: ++ * ++ * Redhat.Sometime employer from May 2006 (my fault, not Redhat's!). ++ * ++ * Cyclades.com. Nigel's employers from Dec 2004 until May 2006, who ++ * allowed him to work on TuxOnIce and PM related issues on company time. ++ * ++ * LinuxFund.org. Sponsored Nigel's work on TuxOnIce for four months Oct ++ * 2003 to Jan 2004. ++ * ++ * LAC Linux. Donated P4 hardware that enabled development and ongoing ++ * maintenance of SMP and Highmem support. ++ * ++ * OSDL. Provided access to various hardware configurations, make ++ * occasional small donations to the project. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for get/set_fs & KERNEL_DS on i386 */ ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_cluster.h" ++ ++/*! Pageset metadata. */ ++struct pagedir pagedir2 = {2}; ++EXPORT_SYMBOL_GPL(pagedir2); ++ ++static mm_segment_t oldfs; ++static DEFINE_MUTEX(tuxonice_in_use); ++static int block_dump_save; ++ ++/* Binary signature if an image is present */ ++char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; ++EXPORT_SYMBOL_GPL(tuxonice_signature); ++ ++unsigned long boot_kernel_data_buffer; ++ ++static char *result_strings[] = { ++ "Hibernation was aborted", ++ "The user requested that we cancel the hibernation", ++ "No storage was available", ++ "Insufficient storage was available", ++ "Freezing filesystems and/or tasks failed", ++ "A pre-existing image was used", ++ "We would free memory, but image size limit doesn't allow this", ++ "Unable to free enough memory to hibernate", ++ "Unable to obtain the Power Management Semaphore", ++ "A device suspend/resume returned an error", ++ "A system device suspend/resume returned an error", ++ "The extra pages allowance is too small", ++ "We were unable to successfully prepare an image", ++ "TuxOnIce module initialisation failed", ++ "TuxOnIce module cleanup failed", ++ "I/O errors were encountered", ++ "Ran out of memory", ++ "An error was encountered while reading the image", ++ "Platform preparation failed", ++ "CPU Hotplugging failed", ++ "Architecture specific preparation failed", ++ "Pages needed resaving, but we were told to abort if this happens", ++ "We can't hibernate at the moment (invalid resume= or filewriter " ++ "target?)", ++ "A hibernation preparation notifier chain member cancelled the " ++ "hibernation", ++ "Pre-snapshot preparation failed", ++ "Pre-restore preparation failed", ++ "Failed to disable usermode helpers", ++ "Can't resume from alternate image", ++ "Header reservation too small", ++ "Device Power Management Preparation failed", 
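++	/*
++	 * Indexed by result-state number: get_toi_debug_info() below prints
++	 * result_strings[i] for each result state that is set, so the order
++	 * here is assumed to match the TOI result-state numbering.
++	 */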
++}; ++ ++/** ++ * toi_finish_anything - cleanup after doing anything ++ * @hibernate_or_resume: Whether finishing a cycle or attempt at ++ * resuming. ++ * ++ * This is our basic clean-up routine, matching start_anything below. We ++ * call cleanup routines, drop module references and restore process fs and ++ * cpus allowed masks, together with the global block_dump variable's value. ++ **/ ++void toi_finish_anything(int hibernate_or_resume) ++{ ++ toi_cleanup_modules(hibernate_or_resume); ++ toi_put_modules(); ++ if (hibernate_or_resume) { ++ block_dump = block_dump_save; ++ set_cpus_allowed_ptr(current, cpu_all_mask); ++ toi_alloc_print_debug_stats(); ++ atomic_inc(&snapshot_device_available); ++ unlock_system_sleep(); ++ } ++ ++ set_fs(oldfs); ++ mutex_unlock(&tuxonice_in_use); ++} ++ ++/** ++ * toi_start_anything - basic initialisation for TuxOnIce ++ * @toi_or_resume: Whether starting a cycle or attempt at resuming. ++ * ++ * Our basic initialisation routine. Take references on modules, use the ++ * kernel segment, recheck resume= if no active allocator is set, initialise ++ * modules, save and reset block_dump and ensure we're running on CPU0. ++ **/ ++int toi_start_anything(int hibernate_or_resume) ++{ ++ mutex_lock(&tuxonice_in_use); ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ ++ if (hibernate_or_resume) { ++ lock_system_sleep(); ++ ++ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) ++ goto snapshotdevice_unavailable; ++ } ++ ++ if (hibernate_or_resume == SYSFS_HIBERNATE) ++ toi_print_modules(); ++ ++ if (toi_get_modules()) { ++ printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); ++ goto prehibernate_err; ++ } ++ ++ if (hibernate_or_resume) { ++ block_dump_save = block_dump; ++ block_dump = 0; ++ set_cpus_allowed_ptr(current, ++ cpumask_of(cpumask_first(cpu_online_mask))); ++ } ++ ++ if (toi_initialise_modules_early(hibernate_or_resume)) ++ goto early_init_err; ++ ++ if (!toiActiveAllocator) ++ toi_attempt_to_parse_resume_device(!hibernate_or_resume); ++ ++ if (!toi_initialise_modules_late(hibernate_or_resume)) ++ return 0; ++ ++ toi_cleanup_modules(hibernate_or_resume); ++early_init_err: ++ if (hibernate_or_resume) { ++ block_dump_save = block_dump; ++ set_cpus_allowed_ptr(current, cpu_all_mask); ++ } ++ toi_put_modules(); ++prehibernate_err: ++ if (hibernate_or_resume) ++ atomic_inc(&snapshot_device_available); ++snapshotdevice_unavailable: ++ if (hibernate_or_resume) ++ mutex_unlock(&pm_mutex); ++ set_fs(oldfs); ++ mutex_unlock(&tuxonice_in_use); ++ return -EBUSY; ++} ++ ++/* ++ * Nosave page tracking. ++ * ++ * Here rather than in prepare_image because we want to do it once only at the ++ * start of a cycle. ++ */ ++ ++/** ++ * mark_nosave_pages - set up our Nosave bitmap ++ * ++ * Build a bitmap of Nosave pages from the list. The bitmap allows faster ++ * use when preparing the image. 
++ **/ ++static void mark_nosave_pages(void) ++{ ++ struct nosave_region *region; ++ ++ list_for_each_entry(region, &nosave_regions, list) { ++ unsigned long pfn; ++ ++ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) ++ if (pfn_valid(pfn)) ++ SetPageNosave(pfn_to_page(pfn)); ++ } ++} ++ ++static int toi_alloc_bitmap(struct memory_bitmap **bm) ++{ ++ int result = 0; ++ ++ *bm = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); ++ if (!*bm) { ++ printk(KERN_ERR "Failed to kzalloc memory for a bitmap.\n"); ++ return -ENOMEM; ++ } ++ ++ result = memory_bm_create(*bm, GFP_KERNEL, 0); ++ ++ if (result) { ++ printk(KERN_ERR "Failed to create a bitmap.\n"); ++ kfree(*bm); ++ *bm = NULL; ++ } ++ ++ return result; ++} ++ ++/** ++ * allocate_bitmaps - allocate bitmaps used to record page states ++ * ++ * Allocate the bitmaps we use to record the various TuxOnIce related ++ * page states. ++ **/ ++static int allocate_bitmaps(void) ++{ ++ if (toi_alloc_bitmap(&pageset1_map) || ++ toi_alloc_bitmap(&pageset1_copy_map) || ++ toi_alloc_bitmap(&pageset2_map) || ++ toi_alloc_bitmap(&io_map) || ++ toi_alloc_bitmap(&nosave_map) || ++ toi_alloc_bitmap(&free_map) || ++ toi_alloc_bitmap(&page_resave_map)) ++ return 1; ++ ++ return 0; ++} ++ ++static void toi_free_bitmap(struct memory_bitmap **bm) ++{ ++ if (!*bm) ++ return; ++ ++ memory_bm_free(*bm, 0); ++ kfree(*bm); ++ *bm = NULL; ++} ++ ++/** ++ * free_bitmaps - free the bitmaps used to record page states ++ * ++ * Free the bitmaps allocated above. It is not an error to call ++ * memory_bm_free on a bitmap that isn't currently allocated. ++ **/ ++static void free_bitmaps(void) ++{ ++ toi_free_bitmap(&pageset1_map); ++ toi_free_bitmap(&pageset1_copy_map); ++ toi_free_bitmap(&pageset2_map); ++ toi_free_bitmap(&io_map); ++ toi_free_bitmap(&nosave_map); ++ toi_free_bitmap(&free_map); ++ toi_free_bitmap(&page_resave_map); ++} ++ ++/** ++ * io_MB_per_second - return the number of MB/s read or written ++ * @write: Whether to return the speed at which we wrote. ++ * ++ * Calculate the number of megabytes per second that were read or written. ++ **/ ++static int io_MB_per_second(int write) ++{ ++ return (toi_bkd.toi_io_time[write][1]) ? ++ MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / ++ toi_bkd.toi_io_time[write][1] : 0; ++} ++ ++#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ ++ count - len - 1, ## a); } while (0) ++ ++/** ++ * get_debug_info - fill a buffer with debugging information ++ * @buffer: The buffer to be filled. ++ * @count: The size of the buffer, in bytes. ++ * ++ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will ++ * either printk or return via sysfs. ++ **/ ++static int get_toi_debug_info(const char *buffer, int count) ++{ ++ int len = 0, i, first_result = 1; ++ ++ SNPRINTF("TuxOnIce debugging info:\n"); ++ SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); ++ SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); ++ SNPRINTF("- Compiler vers. 
: %d.%d\n", __GNUC__, __GNUC_MINOR__); ++ SNPRINTF("- Attempt number : %d\n", nr_hibernates); ++ SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", ++ toi_result, ++ toi_bkd.toi_action, ++ toi_bkd.toi_debug_state, ++ toi_bkd.toi_default_console_level, ++ image_size_limit, ++ toi_poweroff_method); ++ SNPRINTF("- Overall expected compression percentage: %d.\n", ++ 100 - toi_expected_compression_ratio()); ++ len += toi_print_module_debug_info(((char *) buffer) + len, ++ count - len - 1); ++ if (toi_bkd.toi_io_time[0][1]) { ++ if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { ++ SNPRINTF("- I/O speed: Write %ld KB/s", ++ (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / ++ toi_bkd.toi_io_time[0][1])); ++ if (toi_bkd.toi_io_time[1][1]) ++ SNPRINTF(", Read %ld KB/s", ++ (KB((unsigned long) ++ toi_bkd.toi_io_time[1][0]) * HZ / ++ toi_bkd.toi_io_time[1][1])); ++ } else { ++ SNPRINTF("- I/O speed: Write %ld MB/s", ++ (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / ++ toi_bkd.toi_io_time[0][1])); ++ if (toi_bkd.toi_io_time[1][1]) ++ SNPRINTF(", Read %ld MB/s", ++ (MB((unsigned long) ++ toi_bkd.toi_io_time[1][0]) * HZ / ++ toi_bkd.toi_io_time[1][1])); ++ } ++ SNPRINTF(".\n"); ++ } else ++ SNPRINTF("- No I/O speed stats available.\n"); ++ SNPRINTF("- Extra pages : %lu used/%lu.\n", ++ extra_pd1_pages_used, extra_pd1_pages_allowance); ++ ++ for (i = 0; i < TOI_NUM_RESULT_STATES; i++) ++ if (test_result_state(i)) { ++ SNPRINTF("%s: %s.\n", first_result ? ++ "- Result " : ++ " ", ++ result_strings[i]); ++ first_result = 0; ++ } ++ if (first_result) ++ SNPRINTF("- Result : %s.\n", nr_hibernates ? ++ "Succeeded" : ++ "No hibernation attempts so far"); ++ return len; ++} ++ ++/** ++ * do_cleanup - cleanup after attempting to hibernate or resume ++ * @get_debug_info: Whether to allocate and return debugging info. ++ * ++ * Cleanup after attempting to hibernate or resume, possibly getting ++ * debugging info as we do so. ++ **/ ++static void do_cleanup(int get_debug_info, int restarting) ++{ ++ int i = 0; ++ char *buffer = NULL; ++ ++ trap_non_toi_io = 0; ++ ++ if (get_debug_info) ++ toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); ++ ++ free_checksum_pages(); ++ ++ if (get_debug_info) ++ buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); ++ ++ if (buffer) ++ i = get_toi_debug_info(buffer, PAGE_SIZE); ++ ++ toi_free_extra_pagedir_memory(); ++ ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ set_highmem_size(pagedir1, 0); ++ set_highmem_size(pagedir2, 0); ++ ++ if (boot_kernel_data_buffer) { ++ if (!test_toi_state(TOI_BOOT_KERNEL)) ++ toi_free_page(37, boot_kernel_data_buffer); ++ boot_kernel_data_buffer = 0; ++ } ++ ++ clear_toi_state(TOI_BOOT_KERNEL); ++ thaw_processes(); ++ ++ if (!restarting) ++ toi_stop_other_threads(); ++ ++ if (test_action_state(TOI_KEEP_IMAGE) && ++ !test_result_state(TOI_ABORTED)) { ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, ++ "TuxOnIce: Not invalidating the image due " ++ "to Keep Image being enabled."); ++ set_result_state(TOI_KEPT_IMAGE); ++ } else ++ if (toiActiveAllocator) ++ toiActiveAllocator->remove_image(); ++ ++ free_bitmaps(); ++ usermodehelper_enable(); ++ ++ if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { ++ pm_notifier_call_chain(PM_POST_HIBERNATION); ++ clear_toi_state(TOI_NOTIFIERS_PREPARE); ++ } ++ ++ if (buffer && i) { ++ /* Printk can only handle 1023 bytes, including ++ * its level mangling. 
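++		 * The debug buffer is a full page, so it is emitted in
++		 * 1023-byte chunks below.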
*/ ++ for (i = 0; i < 3; i++) ++ printk(KERN_ERR "%s", buffer + (1023 * i)); ++ toi_free_page(20, (unsigned long) buffer); ++ } ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) ++ enable_nonboot_cpus(); ++ ++ if (!restarting) ++ toi_cleanup_console(); ++ ++ free_attention_list(); ++ ++ if (!restarting) ++ toi_deactivate_storage(0); ++ ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ clear_toi_state(TOI_TRYING_TO_RESUME); ++ clear_toi_state(TOI_NOW_RESUMING); ++} ++ ++/** ++ * check_still_keeping_image - we kept an image; check whether to reuse it. ++ * ++ * We enter this routine when we have kept an image. If the user has said they ++ * want to still keep it, all we need to do is powerdown. If powering down ++ * means hibernating to ram and the power doesn't run out, we'll return 1. ++ * If we do power off properly or the battery runs out, we'll resume via the ++ * normal paths. ++ * ++ * If the user has said they want to remove the previously kept image, we ++ * remove it, and return 0. We'll then store a new image. ++ **/ ++static int check_still_keeping_image(void) ++{ ++ if (test_action_state(TOI_KEEP_IMAGE)) { ++ printk(KERN_INFO "Image already stored: powering down " ++ "immediately."); ++ do_toi_step(STEP_HIBERNATE_POWERDOWN); ++ return 1; /* Just in case we're using S3 */ ++ } ++ ++ printk(KERN_INFO "Invalidating previous image.\n"); ++ toiActiveAllocator->remove_image(); ++ ++ return 0; ++} ++ ++/** ++ * toi_init - prepare to hibernate to disk ++ * ++ * Initialise variables & data structures, in preparation for ++ * hibernating to disk. ++ **/ ++static int toi_init(int restarting) ++{ ++ int result, i, j; ++ ++ toi_result = 0; ++ ++ printk(KERN_INFO "Initiating a hibernation cycle.\n"); ++ ++ nr_hibernates++; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 2; j++) ++ toi_bkd.toi_io_time[i][j] = 0; ++ ++ if (!test_toi_state(TOI_CAN_HIBERNATE) || ++ allocate_bitmaps()) ++ return 1; ++ ++ mark_nosave_pages(); ++ ++ if (!restarting) ++ toi_prepare_console(); ++ ++ result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); ++ if (result) { ++ set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); ++ return 1; ++ } ++ set_toi_state(TOI_NOTIFIERS_PREPARE); ++ ++ if (!restarting) { ++ printk(KERN_ERR "Starting other threads."); ++ toi_start_other_threads(); ++ } ++ ++ result = usermodehelper_disable(); ++ if (result) { ++ printk(KERN_ERR "TuxOnIce: Failed to disable usermode " ++ "helpers\n"); ++ set_result_state(TOI_USERMODE_HELPERS_ERR); ++ return 1; ++ } ++ ++ boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); ++ if (!boot_kernel_data_buffer) { ++ printk(KERN_ERR "TuxOnIce: Failed to allocate " ++ "boot_kernel_data_buffer.\n"); ++ set_result_state(TOI_OUT_OF_MEMORY); ++ return 1; ++ } ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG) && ++ disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * can_hibernate - perform basic 'Can we hibernate?' tests ++ * ++ * Perform basic tests that must pass if we're going to be able to hibernate: ++ * Can we get the pm_mutex? Is resume= valid (we need to know where to write ++ * the image header). ++ **/ ++static int can_hibernate(void) ++{ ++ if (!test_toi_state(TOI_CAN_HIBERNATE)) ++ toi_attempt_to_parse_resume_device(0); ++ ++ if (!test_toi_state(TOI_CAN_HIBERNATE)) { ++ printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" ++ "This may be because you haven't put something along " ++ "the lines of\n\nresume=swap:/dev/hda1\n\n" ++ "in lilo.conf or equivalent. 
(Where /dev/hda1 is your " ++ "swap partition).\n"); ++ set_abort_result(TOI_CANT_SUSPEND); ++ return 0; ++ } ++ ++ if (strlen(alt_resume_param)) { ++ attempt_to_parse_alt_resume_param(); ++ ++ if (!strlen(alt_resume_param)) { ++ printk(KERN_INFO "Alternate resume parameter now " ++ "invalid. Aborting.\n"); ++ set_abort_result(TOI_CANT_USE_ALT_RESUME); ++ return 0; ++ } ++ } ++ ++ return 1; ++} ++ ++/** ++ * do_post_image_write - having written an image, figure out what to do next ++ * ++ * After writing an image, we might load an alternate image or power down. ++ * Powering down might involve hibernating to ram, in which case we also ++ * need to handle reloading pageset2. ++ **/ ++static int do_post_image_write(void) ++{ ++ /* If switching images fails, do normal powerdown */ ++ if (alt_resume_param[0]) ++ do_toi_step(STEP_RESUME_ALT_IMAGE); ++ ++ toi_power_down(); ++ ++ barrier(); ++ mb(); ++ return 0; ++} ++ ++/** ++ * __save_image - do the hard work of saving the image ++ * ++ * High level routine for getting the image saved. The key assumptions made ++ * are that processes have been frozen and sufficient memory is available. ++ * ++ * We also exit through here at resume time, coming back from toi_hibernate ++ * after the atomic restore. This is the reason for the toi_in_hibernate ++ * test. ++ **/ ++static int __save_image(void) ++{ ++ int temp_result, did_copy = 0; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); ++ ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, ++ " - Final values: %d and %d.", ++ pagedir1.size, pagedir2.size); ++ ++ toi_cond_pause(1, "About to write pagedir2."); ++ ++ temp_result = write_pageset(&pagedir2); ++ ++ if (temp_result == -1 || test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ toi_cond_pause(1, "About to copy pageset 1."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ toi_deactivate_storage(1); ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); ++ ++ toi_in_hibernate = 1; ++ ++ if (toi_go_atomic(PMSG_FREEZE, 1)) ++ goto Failed; ++ ++ temp_result = toi_hibernate(); ++ ++#ifdef CONFIG_KGDB ++ if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) ++ kgdb_breakpoint(); ++#endif ++ ++ if (!temp_result) ++ did_copy = 1; ++ ++ /* We return here at resume time too! */ ++ toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); ++ ++Failed: ++ if (toi_activate_storage(1)) ++ panic("Failed to reactivate our storage."); ++ ++ /* Resume time? */ ++ if (!toi_in_hibernate) { ++ copyback_post(); ++ return 0; ++ } ++ ++ /* Nope. Hibernating. So, see if we can save the image... */ ++ ++ if (temp_result || test_result_state(TOI_ABORTED)) { ++ if (did_copy) ++ goto abort_reloading_pagedir_two; ++ else ++ return 1; ++ } ++ ++ toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, ++ NULL); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ toi_cond_pause(1, "About to write pageset1."); ++ ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); ++ ++ temp_result = write_pageset(&pagedir1); ++ ++ /* We didn't overwrite any memory, so no reread needs to be done. 
*/ ++ if (test_action_state(TOI_TEST_FILTER_SPEED) || ++ test_action_state(TOI_TEST_BIO)) ++ return 1; ++ ++ if (temp_result == 1 || test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ toi_cond_pause(1, "About to write header."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ temp_result = write_image_header(); ++ ++ if (!temp_result && !test_result_state(TOI_ABORTED)) ++ return 0; ++ ++abort_reloading_pagedir_two: ++ temp_result = read_pageset2(1); ++ ++ /* If that failed, we're sunk. Panic! */ ++ if (temp_result) ++ panic("Attempt to reload pagedir 2 while aborting " ++ "a hibernate failed."); ++ ++ return 1; ++} ++ ++static void map_ps2_pages(int enable) ++{ ++ unsigned long pfn = 0; ++ ++ pfn = memory_bm_next_pfn(pageset2_map); ++ ++ while (pfn != BM_END_OF_MAP) { ++ struct page *page = pfn_to_page(pfn); ++ kernel_map_pages(page, 1, enable); ++ pfn = memory_bm_next_pfn(pageset2_map); ++ } ++} ++ ++/** ++ * do_save_image - save the image and handle the result ++ * ++ * Save the prepared image. If we fail or we're in the path returning ++ * from the atomic restore, cleanup. ++ **/ ++static int do_save_image(void) ++{ ++ int result; ++ map_ps2_pages(0); ++ result = __save_image(); ++ map_ps2_pages(1); ++ return result; ++} ++ ++/** ++ * do_prepare_image - try to prepare an image ++ * ++ * Seek to initialise and prepare an image to be saved. On failure, ++ * cleanup. ++ **/ ++static int do_prepare_image(void) ++{ ++ int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ ++ if (!restarting && toi_activate_storage(0)) ++ return 1; ++ ++ /* ++ * If kept image and still keeping image and hibernating to RAM, we will ++ * return 1 after hibernating and resuming (provided the power doesn't ++ * run out. In that case, we skip directly to cleaning up and exiting. ++ */ ++ ++ if (!can_hibernate() || ++ (test_result_state(TOI_KEPT_IMAGE) && ++ check_still_keeping_image())) ++ return 1; ++ ++ if (toi_init(restarting) || toi_prepare_image() || ++ test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ trap_non_toi_io = 1; ++ ++ return 0; ++} ++ ++/** ++ * do_check_can_resume - find out whether an image has been stored ++ * ++ * Read whether an image exists. We use the same routine as the ++ * image_exists sysfs entry, and just look to see whether the ++ * first character in the resulting buffer is a '1'. ++ **/ ++int do_check_can_resume(void) ++{ ++ int result = -1; ++ ++ if (toi_activate_storage(0)) ++ return -1; ++ ++ if (!test_toi_state(TOI_RESUME_DEVICE_OK)) ++ toi_attempt_to_parse_resume_device(1); ++ ++ if (toiActiveAllocator) ++ result = toiActiveAllocator->image_exists(1); ++ ++ toi_deactivate_storage(0); ++ return result; ++} ++EXPORT_SYMBOL_GPL(do_check_can_resume); ++ ++/** ++ * do_load_atomic_copy - load the first part of an image, if it exists ++ * ++ * Check whether we have an image. If one exists, do sanity checking ++ * (possibly invalidating the image or even rebooting if the user ++ * requests that) before loading it into memory in preparation for the ++ * atomic restore. ++ * ++ * If and only if we have an image loaded and ready to restore, we return 1. ++ **/ ++static int do_load_atomic_copy(void) ++{ ++ int read_image_result = 0; ++ ++ if (sizeof(swp_entry_t) != sizeof(long)) { ++ printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" ++ " of long. 
Please report this!\n"); ++ return 1; ++ } ++ ++ if (!resume_file[0]) ++ printk(KERN_WARNING "TuxOnIce: " ++ "You need to use a resume= command line parameter to " ++ "tell TuxOnIce where to look for an image.\n"); ++ ++ toi_activate_storage(0); ++ ++ if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && ++ !toi_attempt_to_parse_resume_device(0)) { ++ /* ++ * Without a usable storage device we can do nothing - ++ * even if noresume is given ++ */ ++ ++ if (!toiNumAllocators) ++ printk(KERN_ALERT "TuxOnIce: " ++ "No storage allocators have been registered.\n"); ++ else ++ printk(KERN_ALERT "TuxOnIce: " ++ "Missing or invalid storage location " ++ "(resume= parameter). Please correct and " ++ "rerun lilo (or equivalent) before " ++ "hibernating.\n"); ++ toi_deactivate_storage(0); ++ return 1; ++ } ++ ++ if (allocate_bitmaps()) ++ return 1; ++ ++ read_image_result = read_pageset1(); /* non fatal error ignored */ ++ ++ if (test_toi_state(TOI_NORESUME_SPECIFIED)) ++ clear_toi_state(TOI_NORESUME_SPECIFIED); ++ ++ toi_deactivate_storage(0); ++ ++ if (read_image_result) ++ return 1; ++ ++ return 0; ++} ++ ++/** ++ * prepare_restore_load_alt_image - save & restore alt image variables ++ * ++ * Save and restore the pageset1 maps, when loading an alternate image. ++ **/ ++static void prepare_restore_load_alt_image(int prepare) ++{ ++ static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; ++ ++ if (prepare) { ++ pageset1_map_save = pageset1_map; ++ pageset1_map = NULL; ++ pageset1_copy_map_save = pageset1_copy_map; ++ pageset1_copy_map = NULL; ++ set_toi_state(TOI_LOADING_ALT_IMAGE); ++ toi_reset_alt_image_pageset2_pfn(); ++ } else { ++ memory_bm_free(pageset1_map, 0); ++ pageset1_map = pageset1_map_save; ++ memory_bm_free(pageset1_copy_map, 0); ++ pageset1_copy_map = pageset1_copy_map_save; ++ clear_toi_state(TOI_NOW_RESUMING); ++ clear_toi_state(TOI_LOADING_ALT_IMAGE); ++ } ++} ++ ++/** ++ * do_toi_step - perform a step in hibernating or resuming ++ * ++ * Perform a step in hibernating or resuming an image. This abstraction ++ * is in preparation for implementing cluster support, and perhaps replacing ++ * uswsusp too (haven't looked whether that's possible yet). ++ **/ ++int do_toi_step(int step) ++{ ++ switch (step) { ++ case STEP_HIBERNATE_PREPARE_IMAGE: ++ return do_prepare_image(); ++ case STEP_HIBERNATE_SAVE_IMAGE: ++ return do_save_image(); ++ case STEP_HIBERNATE_POWERDOWN: ++ return do_post_image_write(); ++ case STEP_RESUME_CAN_RESUME: ++ return do_check_can_resume(); ++ case STEP_RESUME_LOAD_PS1: ++ return do_load_atomic_copy(); ++ case STEP_RESUME_DO_RESTORE: ++ /* ++ * If we succeed, this doesn't return. ++ * Instead, we return from do_save_image() in the ++ * hibernated kernel. 
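++		 * (toi_atomic_restore() only returns here on failure.)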
++ */ ++ return toi_atomic_restore(); ++ case STEP_RESUME_ALT_IMAGE: ++ printk(KERN_INFO "Trying to resume alternate image.\n"); ++ toi_in_hibernate = 0; ++ save_restore_alt_param(SAVE, NOQUIET); ++ prepare_restore_load_alt_image(1); ++ if (!do_check_can_resume()) { ++ printk(KERN_INFO "Nothing to resume from.\n"); ++ goto out; ++ } ++ if (!do_load_atomic_copy()) ++ toi_atomic_restore(); ++ ++ printk(KERN_INFO "Failed to load image.\n"); ++out: ++ prepare_restore_load_alt_image(0); ++ save_restore_alt_param(RESTORE, NOQUIET); ++ break; ++ case STEP_CLEANUP: ++ do_cleanup(1, 0); ++ break; ++ case STEP_QUIET_CLEANUP: ++ do_cleanup(0, 0); ++ break; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(do_toi_step); ++ ++/* -- Functions for kickstarting a hibernate or resume --- */ ++ ++/** ++ * toi_try_resume - try to do the steps in resuming ++ * ++ * Check if we have an image and if so try to resume. Clear the status ++ * flags too. ++ **/ ++void toi_try_resume(void) ++{ ++ set_toi_state(TOI_TRYING_TO_RESUME); ++ resume_attempted = 1; ++ ++ current->flags |= PF_MEMALLOC; ++ toi_start_other_threads(); ++ ++ if (do_toi_step(STEP_RESUME_CAN_RESUME) && ++ !do_toi_step(STEP_RESUME_LOAD_PS1)) ++ do_toi_step(STEP_RESUME_DO_RESTORE); ++ ++ toi_stop_other_threads(); ++ do_cleanup(0, 0); ++ ++ current->flags &= ~PF_MEMALLOC; ++ ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ clear_toi_state(TOI_TRYING_TO_RESUME); ++ clear_toi_state(TOI_NOW_RESUMING); ++} ++ ++/** ++ * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume ++ * ++ * Wrapper for when __toi_try_resume is called from swsusp resume path, ++ * rather than from echo > /sys/power/tuxonice/do_resume. ++ **/ ++static void toi_sys_power_disk_try_resume(void) ++{ ++ resume_attempted = 1; ++ ++ /* ++ * There's a comment in kernel/power/disk.c that indicates ++ * we should be able to use mutex_lock_nested below. That ++ * doesn't seem to cut it, though, so let's just turn lockdep ++ * off for now. ++ */ ++ lockdep_off(); ++ ++ if (toi_start_anything(SYSFS_RESUMING)) ++ goto out; ++ ++ toi_try_resume(); ++ ++ /* ++ * For initramfs, we have to clear the boot time ++ * flag after trying to resume ++ */ ++ clear_toi_state(TOI_BOOT_TIME); ++ ++ toi_finish_anything(SYSFS_RESUMING); ++out: ++ lockdep_on(); ++} ++ ++/** ++ * toi_try_hibernate - try to start a hibernation cycle ++ * ++ * Start a hibernation cycle, coming in from either ++ * echo > /sys/power/tuxonice/do_suspend ++ * ++ * or ++ * ++ * echo disk > /sys/power/state ++ * ++ * In the later case, we come in without pm_sem taken; in the ++ * former, it has been taken. 
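++ * The two paths are told apart below by checking whether tuxonice_in_use
++ * is already held.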
++ **/ ++int toi_try_hibernate(void) ++{ ++ int result = 0, sys_power_disk = 0, retries = 0; ++ ++ if (!mutex_is_locked(&tuxonice_in_use)) { ++ /* Came in via /sys/power/disk */ ++ if (toi_start_anything(SYSFS_HIBERNATING)) ++ return -EBUSY; ++ sys_power_disk = 1; ++ } ++ ++ current->flags |= PF_MEMALLOC; ++ ++ if (test_toi_state(TOI_CLUSTER_MODE)) { ++ toi_initiate_cluster_hibernate(); ++ goto out; ++ } ++ ++prepare: ++ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); ++ ++ if (result) ++ goto out; ++ ++ if (test_action_state(TOI_FREEZER_TEST)) ++ goto out_restore_gfp_mask; ++ ++ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); ++ ++ if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { ++ if (retries < 2) { ++ do_cleanup(0, 1); ++ retries++; ++ clear_result_state(TOI_ABORTED); ++ extra_pd1_pages_allowance = extra_pd1_pages_used + 500; ++ printk(KERN_INFO "Automatically adjusting the extra" ++ " pages allowance to %ld and restarting.\n", ++ extra_pd1_pages_allowance); ++ pm_restore_gfp_mask(); ++ goto prepare; ++ } ++ ++ printk(KERN_INFO "Adjusted extra pages allowance twice and " ++ "still couldn't hibernate successfully. Giving up."); ++ } ++ ++ /* This code runs at resume time too! */ ++ if (!result && toi_in_hibernate) ++ result = do_toi_step(STEP_HIBERNATE_POWERDOWN); ++ ++out_restore_gfp_mask: ++ pm_restore_gfp_mask(); ++out: ++ do_cleanup(1, 0); ++ current->flags &= ~PF_MEMALLOC; ++ ++ if (sys_power_disk) ++ toi_finish_anything(SYSFS_HIBERNATING); ++ ++ return result; ++} ++ ++/* ++ * channel_no: If !0, -c is added to args (userui). ++ */ ++int toi_launch_userspace_program(char *command, int channel_no, ++ int wait, int debug) ++{ ++ int retval; ++ static char *envp[] = { ++ "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ++ }; ++ char *channel = NULL; ++ int arg = 0, size; ++ char test_read[255]; ++ char *orig_posn = command; ++ ++ if (!strlen(orig_posn)) ++ return 1; ++ ++ if (channel_no) { ++ channel = toi_kzalloc(4, 6, GFP_KERNEL); ++ if (!channel) { ++ printk(KERN_INFO "Failed to allocate memory in " ++ "preparing to launch userspace program.\n"); ++ return 1; ++ } ++ } ++ ++ /* Up to 6 args supported */ ++ while (arg < 6) { ++ sscanf(orig_posn, "%s", test_read); ++ size = strlen(test_read); ++ if (!(size)) ++ break; ++ argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); ++ strcpy(argv[arg], test_read); ++ orig_posn += size + 1; ++ *test_read = 0; ++ arg++; ++ } ++ ++ if (channel_no) { ++ sprintf(channel, "-c%d", channel_no); ++ argv[arg] = channel; ++ } else ++ arg--; ++ ++ if (debug) { ++ argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); ++ strcpy(argv[arg], "--debug"); ++ } ++ ++ retval = call_usermodehelper(argv[0], argv, envp, wait); ++ ++ /* ++ * If the program reports an error, retval = 256. Don't complain ++ * about that here. ++ */ ++ if (retval && retval != 256) ++ printk(KERN_ERR "Failed to launch userspace program '%s': " ++ "Error %d\n", command, retval); ++ ++ { ++ int i; ++ for (i = 0; i < arg; i++) ++ if (argv[i] && argv[i] != channel) ++ toi_kfree(5, argv[i], sizeof(*argv[i])); ++ } ++ ++ toi_kfree(4, channel, sizeof(*channel)); ++ ++ return retval; ++} ++ ++/* ++ * This array contains entries that are automatically registered at ++ * boot. Modules and the console code register their own entries separately. 
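++ * Each entry is hooked up in core_load() below via
++ * toi_register_sysfs_file().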
++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_LONG("extra_pages_allowance", SYSFS_RW, ++ &extra_pd1_pages_allowance, 0, LONG_MAX, 0), ++ SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, ++ image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), ++ SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, ++ SYSFS_NEEDS_SM_FOR_WRITE, ++ attempt_to_parse_resume_device2), ++ SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, ++ SYSFS_NEEDS_SM_FOR_WRITE, ++ attempt_to_parse_alt_resume_param), ++ SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, ++ NULL), ++ SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_IGNORE_ROOTFS, 0), ++ SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, ++ INT_MAX, 0), ++ SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), ++ SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_MULTITHREADED_IO, 0), ++ SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_FLUSHER_THREAD, 0), ++ SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_PAGESET2_FULL, 0), ++ SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), ++ SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_REPLACE_SWSUSP, 0), ++ SYSFS_STRING("resume_commandline", SYSFS_RW, ++ toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, ++ NULL), ++ SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), ++ SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_FREEZER_TEST, 0), ++ SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), ++ SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_TEST_FILTER_SPEED, 0), ++ SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_PAGESET2, 0), ++ SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_PS2_IF_UNNEEDED, 0), ++ SYSFS_BIT("late_cpu_hotplug", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_LATE_CPU_HOTPLUG, 0), ++ SYSFS_STRING("binary_signature", SYSFS_READONLY, ++ tuxonice_signature, 9, 0, NULL), ++ SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, ++ NULL), ++#ifdef CONFIG_KGDB ++ SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_POST_RESUME_BREAKPOINT, 0), ++#endif ++ SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_READAHEAD, 0), ++#ifdef CONFIG_TOI_KEEP_IMAGE ++ SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, ++ 0), ++#endif ++}; ++ ++static struct toi_core_fns my_fns = { ++ .get_nonconflicting_page = __toi_get_nonconflicting_page, ++ .post_context_save = __toi_post_context_save, ++ .try_hibernate = toi_try_hibernate, ++ .try_resume = toi_sys_power_disk_try_resume, ++}; ++ ++/** ++ * core_load - initialisation of TuxOnIce core ++ * ++ * Initialise the core, beginning with sysfs. Checksum and so on are part of ++ * the core, but have their own initialisation routines because they either ++ * aren't compiled in all the time or have their own subdirectories. 
++ **/ ++static __init int core_load(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ++ " (http://tuxonice.net)\n"); ++ ++ if (toi_sysfs_init()) ++ return 1; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ toi_core_fns = &my_fns; ++ ++ if (toi_alloc_init()) ++ return 1; ++ if (toi_checksum_init()) ++ return 1; ++ if (toi_usm_init()) ++ return 1; ++ if (toi_ui_init()) ++ return 1; ++ if (toi_poweroff_init()) ++ return 1; ++ if (toi_cluster_init()) ++ return 1; ++ ++ return 0; ++} ++ ++#ifdef MODULE ++/** ++ * core_unload: Prepare to unload the core code. ++ **/ ++static __exit void core_unload(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ toi_alloc_exit(); ++ toi_checksum_exit(); ++ toi_poweroff_exit(); ++ toi_ui_exit(); ++ toi_usm_exit(); ++ toi_cluster_exit(); ++ ++ for (i = 0; i < numfiles; i++) ++ toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ toi_core_fns = NULL; ++ ++ toi_sysfs_exit(); ++} ++MODULE_LICENSE("GPL"); ++module_init(core_load); ++module_exit(core_unload); ++#else ++late_initcall(core_load); ++#endif +diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c +new file mode 100644 +index 0000000..16d58fb +--- /dev/null ++++ b/kernel/power/tuxonice_incremental.c +@@ -0,0 +1,383 @@ ++/* ++ * kernel/power/incremental.c ++ * ++ * Copyright (C) 2012 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains routines related to storing incremental images - that ++ * is, retaining an image after an initial cycle and then storing incremental ++ * changes on subsequent hibernations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++static struct toi_module_ops toi_incremental_ops; ++static struct toi_module_ops *next_driver; ++static unsigned long toi_incremental_bytes_in, toi_incremental_bytes_out; ++ ++static char toi_incremental_slow_cmp_name[32] = "sha1"; ++static int toi_incremental_digestsize; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ u8 *buffer_start; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ unsigned char *digest; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. 
++ */ ++static int toi_incremental_crypto_prepare(void) ++{ ++ int cpu, digestsize = toi_incremental_digestsize; ++ ++ if (!*toi_incremental_slow_cmp_name) { ++ printk(KERN_INFO "TuxOnIce: Incremental image support enabled but no " ++ "hash algorithm set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->desc.tfm = crypto_alloc_hash(toi_incremental_slow_cmp_name, 0, 0); ++ if (IS_ERR(this->desc.tfm)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s hashing transform.\n", ++ toi_incremental_slow_cmp_name); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ if (!digestsize) { ++ digestsize = crypto_hash_digestsize(this->desc.tfm); ++ toi_incremental_digestsize = digestsize; ++ } ++ ++ this->digest = toi_kzalloc(16, digestsize, GFP_KERNEL); ++ if (!this->digest) ++ return -ENOMEM; ++ ++ this->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ } ++ ++ return 0; ++} ++ ++static int toi_incremental_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->desc.tfm) { ++ crypto_free_hash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->digest) { ++ toi_kfree(16, this->digest, toi_incremental_digestsize); ++ this->digest = NULL; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_incremental_init ++ */ ++ ++static int toi_incremental_init(int hibernate_or_resume) ++{ ++ if (!hibernate_or_resume) ++ return 0; ++ ++ next_driver = toi_get_next_filter(&toi_incremental_ops); ++ ++ return next_driver ? 0 : -ECHILD; ++} ++ ++/* ++ * toi_incremental_rw_init() ++ */ ++ ++static int toi_incremental_rw_init(int rw, int stream_number) ++{ ++ if (rw == WRITE && toi_incremental_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise hashing " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ " calculating an incremental image.\n"); ++ toi_incremental_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_incremental_write_page() ++ * ++ * Decide whether to write a page to the image. Calculate the SHA1 (or something ++ * else if the user changes the hashing algo) of the page and compare it to the ++ * previous value (if any). If there was no previous value or the values are ++ * different, write the page. Otherwise, skip the write. ++ * ++ * @TODO: Clear hashes for pages that are no longer in the image! ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be written. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, -ECHILD if we have a broken pipeline or -EIO if ++ * zlib errs. 
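++ *
++ * A rough sketch of the intended flow (get_old_hash() and store_hash()
++ * appear below only in commented-out form, so those names are
++ * assumptions about the eventual implementation):
++ *
++ *   crypto_hash_digest(&ctx->desc, sg, length, new_hash);
++ *   old_hash = get_old_hash(index);
++ *   if (old_hash && !memcmp(new_hash, old_hash, digestsize))
++ *     to_write = false;              (page unchanged - skip the write)
++ *   else {
++ *     store_hash(ctx, index, new_hash);
++ *     next_driver->write_page(index, buf_type, buffer_page, buf_size);
++ *   }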
++ */ ++static int toi_incremental_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ int to_write = true; ++ ++ if (ctx->desc.tfm) { ++ // char *old_hash; ++ ++ ctx->buffer_start = TOI_MAP(buf_type, buffer_page); ++ ++ sg_init_one(&ctx->sg[0], ctx->buffer_start, buf_size); ++ ++ ret = crypto_hash_digest(&ctx->desc, &ctx->sg[0], ctx->sg[0].length, ctx->digest); ++ // old_hash = get_old_hash(index); ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ ++#if 0 ++ if (!ret && new_hash == old_hash) { ++ to_write = false; ++ } else ++ store_hash(ctx, index, new_hash); ++#endif ++ } ++ ++ mutex_lock(&stats_lock); ++ ++ toi_incremental_bytes_in += buf_size; ++ if (ret || to_write) ++ toi_incremental_bytes_out += buf_size; ++ ++ mutex_unlock(&stats_lock); ++ ++ if (ret || to_write) { ++ int ret2 = next_driver->write_page(index, buf_type, ++ buffer_page, buf_size); ++ if (!ret) ++ ret = ret2; ++ } ++ ++ return ret; ++} ++ ++/* ++ * toi_incremental_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Nothing extra to do here. ++ */ ++static int toi_incremental_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, buf_size); ++} ++ ++/* ++ * toi_incremental_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_incremental_print_debug_stats(char *buffer, int size) ++{ ++ unsigned long pages_in = toi_incremental_bytes_in >> PAGE_SHIFT, ++ pages_out = toi_incremental_bytes_out >> PAGE_SHIFT; ++ int len; ++ ++ /* Output the size of the incremental image. */ ++ if (*toi_incremental_slow_cmp_name) ++ len = scnprintf(buffer, size, "- Hash algorithm is '%s'.\n", ++ toi_incremental_slow_cmp_name); ++ else ++ len = scnprintf(buffer, size, "- Hash algorithm is not set.\n"); ++ ++ if (pages_in) ++ len += scnprintf(buffer+len, size - len, " Incremental image " ++ "%lu of %lu bytes (%ld percent).\n", ++ toi_incremental_bytes_out, ++ toi_incremental_bytes_in, ++ pages_out * 100 / pages_in); ++ return len; ++} ++ ++/* ++ * toi_incremental_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_incremental_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_incremental_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + sizeof(int) + ++ strlen(toi_incremental_slow_cmp_name) + 1; ++} ++ ++/* ++ * toi_incremental_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. 
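++ *
++ * For reference, the buffer written here (and read back by
++ * toi_incremental_load_config_info below) is laid out as:
++ *
++ *   unsigned long  toi_incremental_bytes_in
++ *   unsigned long  toi_incremental_bytes_out
++ *   int            len (strlen of the algorithm name + 1)
++ *   char[len]      hash algorithm name, NUL-terminated
++ *
++ * matching the size reported by toi_incremental_storage_needed().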
++ */ ++static int toi_incremental_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_incremental_slow_cmp_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_incremental_bytes_in; ++ offset += sizeof(unsigned long); ++ *((unsigned long *) (buffer + offset)) = toi_incremental_bytes_out; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_incremental_slow_cmp_name, len); ++ return offset + len; ++} ++ ++/* toi_incremental_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information to be retained for debugging info. ++ */ ++static void toi_incremental_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_incremental_bytes_in = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ toi_incremental_bytes_out = *((unsigned long *) (buffer + offset)); ++ offset += sizeof(unsigned long); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_incremental_slow_cmp_name, buffer + offset, len); ++} ++ ++static void toi_incremental_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->incremental_bytes_in = toi_incremental_bytes_in; ++ bkd->incremental_bytes_out = toi_incremental_bytes_out; ++} ++ ++static void toi_incremental_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_incremental_bytes_in = bkd->incremental_bytes_in; ++ toi_incremental_bytes_out = bkd->incremental_bytes_out; ++} ++ ++static void toi_incremental_algo_change(void) ++{ ++ /* Reset so it's gotten from crypto_hash_digestsize afresh */ ++ toi_incremental_digestsize = 0; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_incremental_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_incremental_slow_cmp_name, 31, 0, toi_incremental_algo_change), ++}; ++ ++/* ++ * Ops structure. 
++ */ ++static struct toi_module_ops toi_incremental_ops = { ++ .type = FILTER_MODULE, ++ .name = "incremental", ++ .directory = "incremental", ++ .module = THIS_MODULE, ++ .initialise = toi_incremental_init, ++ .memory_needed = toi_incremental_memory_needed, ++ .print_debug_info = toi_incremental_print_debug_stats, ++ .save_config_info = toi_incremental_save_config_info, ++ .load_config_info = toi_incremental_load_config_info, ++ .storage_needed = toi_incremental_storage_needed, ++ ++ .pre_atomic_restore = toi_incremental_pre_atomic_restore, ++ .post_atomic_restore = toi_incremental_post_atomic_restore, ++ ++ .rw_init = toi_incremental_rw_init, ++ .rw_cleanup = toi_incremental_rw_cleanup, ++ ++ .write_page = toi_incremental_write_page, ++ .read_page = toi_incremental_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_incremental_load(void) ++{ ++ return toi_register_module(&toi_incremental_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_incremental_unload(void) ++{ ++ toi_unregister_module(&toi_incremental_ops); ++} ++ ++module_init(toi_incremental_load); ++module_exit(toi_incremental_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Incremental Image Support for TuxOnIce"); ++#else ++late_initcall(toi_incremental_load); ++#endif +diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c +new file mode 100644 +index 0000000..901f1c9 +--- /dev/null ++++ b/kernel/power/tuxonice_io.c +@@ -0,0 +1,1936 @@ ++/* ++ * kernel/power/tuxonice_io.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains high level IO routines for hibernating. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_alloc.h" ++char alt_resume_param[256]; ++ ++/* Version read from image header at resume */ ++static int toi_image_header_version; ++ ++#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ ++ if (likely(toi_image_header_version >= VERS)) \ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ ++ (char *) &VAR, sizeof(VAR))) { \ ++ abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ ++ ERR_ACT; \ ++ } \ ++} while(0) \ ++ ++/* Variables shared between threads and updated under the mutex */ ++static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; ++static int io_index, io_nextupdate, io_pc, io_pc_step; ++static DEFINE_MUTEX(io_mutex); ++static DEFINE_PER_CPU(struct page *, last_sought); ++static DEFINE_PER_CPU(struct page *, last_high_page); ++static DEFINE_PER_CPU(char *, checksum_locn); ++static DEFINE_PER_CPU(struct pbe *, last_low_page); ++static atomic_t io_count; ++atomic_t toi_io_workers; ++EXPORT_SYMBOL_GPL(toi_io_workers); ++ ++static int using_flusher; ++ ++DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); ++EXPORT_SYMBOL_GPL(toi_io_queue_flusher); ++ ++int toi_bio_queue_flusher_should_finish; ++EXPORT_SYMBOL_GPL(toi_bio_queue_flusher_should_finish); ++ ++int toi_max_workers; ++ ++static char *image_version_error = "The image header version is newer than " \ ++ "this kernel supports."; ++ ++struct toi_module_ops *first_filter; ++ ++static atomic_t toi_num_other_threads; ++static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); ++enum toi_worker_commands { ++ TOI_IO_WORKER_STOP, ++ TOI_IO_WORKER_RUN, ++ TOI_IO_WORKER_EXIT ++}; ++static enum toi_worker_commands toi_worker_command; ++ ++/** ++ * toi_attempt_to_parse_resume_device - determine if we can hibernate ++ * ++ * Can we hibernate, using the current resume= parameter? ++ **/ ++int toi_attempt_to_parse_resume_device(int quiet) ++{ ++ struct list_head *Allocator; ++ struct toi_module_ops *thisAllocator; ++ int result, returning = 0; ++ ++ if (toi_activate_storage(0)) ++ return 0; ++ ++ toiActiveAllocator = NULL; ++ clear_toi_state(TOI_RESUME_DEVICE_OK); ++ clear_toi_state(TOI_CAN_RESUME); ++ clear_result_state(TOI_ABORTED); ++ ++ if (!toiNumAllocators) { ++ if (!quiet) ++ printk(KERN_INFO "TuxOnIce: No storage allocators have " ++ "been registered. Hibernating will be " ++ "disabled.\n"); ++ goto cleanup; ++ } ++ ++ list_for_each(Allocator, &toiAllocators) { ++ thisAllocator = list_entry(Allocator, struct toi_module_ops, ++ type_list); ++ ++ /* ++ * Not sure why you'd want to disable an allocator, but ++ * we should honour the flag if we're providing it ++ */ ++ if (!thisAllocator->enabled) ++ continue; ++ ++ result = thisAllocator->parse_sig_location( ++ resume_file, (toiNumAllocators == 1), ++ quiet); ++ ++ switch (result) { ++ case -EINVAL: ++ /* For this allocator, but not a valid ++ * configuration. Error already printed. */ ++ goto cleanup; ++ ++ case 0: ++ /* For this allocator and valid. 
*/ ++ toiActiveAllocator = thisAllocator; ++ ++ set_toi_state(TOI_RESUME_DEVICE_OK); ++ set_toi_state(TOI_CAN_RESUME); ++ returning = 1; ++ goto cleanup; ++ } ++ } ++ if (!quiet) ++ printk(KERN_INFO "TuxOnIce: No matching enabled allocator " ++ "found. Resuming disabled.\n"); ++cleanup: ++ toi_deactivate_storage(0); ++ return returning; ++} ++EXPORT_SYMBOL_GPL(toi_attempt_to_parse_resume_device); ++ ++void attempt_to_parse_resume_device2(void) ++{ ++ toi_prepare_usm(); ++ toi_attempt_to_parse_resume_device(0); ++ toi_cleanup_usm(); ++} ++EXPORT_SYMBOL_GPL(attempt_to_parse_resume_device2); ++ ++void save_restore_alt_param(int replace, int quiet) ++{ ++ static char resume_param_save[255]; ++ static unsigned long toi_state_save; ++ ++ if (replace) { ++ toi_state_save = toi_state; ++ strcpy(resume_param_save, resume_file); ++ strcpy(resume_file, alt_resume_param); ++ } else { ++ strcpy(resume_file, resume_param_save); ++ toi_state = toi_state_save; ++ } ++ toi_attempt_to_parse_resume_device(quiet); ++} ++ ++void attempt_to_parse_alt_resume_param(void) ++{ ++ int ok = 0; ++ ++ /* Temporarily set resume_param to the poweroff value */ ++ if (!strlen(alt_resume_param)) ++ return; ++ ++ printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); ++ save_restore_alt_param(SAVE, NOQUIET); ++ if (test_toi_state(TOI_CAN_RESUME)) ++ ok = 1; ++ ++ printk(KERN_INFO "=== Done ===\n"); ++ save_restore_alt_param(RESTORE, QUIET); ++ ++ /* If not ok, clear the string */ ++ if (ok) ++ return; ++ ++ printk(KERN_INFO "Can't resume from that location; clearing " ++ "alt_resume_param.\n"); ++ alt_resume_param[0] = '\0'; ++} ++ ++/** ++ * noresume_reset_modules - reset data structures in case of non resuming ++ * ++ * When we read the start of an image, modules (and especially the ++ * active allocator) might need to reset data structures if we ++ * decide to remove the image rather than resuming from it. ++ **/ ++static void noresume_reset_modules(void) ++{ ++ struct toi_module_ops *this_filter; ++ ++ list_for_each_entry(this_filter, &toi_filters, type_list) ++ if (this_filter->noresume_reset) ++ this_filter->noresume_reset(); ++ ++ if (toiActiveAllocator && toiActiveAllocator->noresume_reset) ++ toiActiveAllocator->noresume_reset(); ++} ++ ++/** ++ * fill_toi_header - fill the hibernate header structure ++ * @struct toi_header: Header data structure to be filled. ++ **/ ++static int fill_toi_header(struct toi_header *sh) ++{ ++ int i, error; ++ ++ error = init_header((struct swsusp_info *) sh); ++ if (error) ++ return error; ++ ++ sh->pagedir = pagedir1; ++ sh->pageset_2_size = pagedir2.size; ++ sh->param0 = toi_result; ++ sh->param1 = toi_bkd.toi_action; ++ sh->param2 = toi_bkd.toi_debug_state; ++ sh->param3 = toi_bkd.toi_default_console_level; ++ sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; ++ for (i = 0; i < 4; i++) ++ sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; ++ sh->bkd = boot_kernel_data_buffer; ++ return 0; ++} ++ ++/** ++ * rw_init_modules - initialize modules ++ * @rw: Whether we are reading of writing an image. ++ * @which: Section of the image being processed. ++ * ++ * Iterate over modules, preparing the ones that will be used to read or write ++ * data. 
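++ *
++ * The order used below is: page transformers (filters) first, then the
++ * active allocator, then the remaining modules. As a sketch, with the
++ * incremental filter from this patch enabled (the module names are only
++ * examples of what might be registered):
++ *
++ *   toi_incremental_ops.rw_init(rw, which)    (FILTER_MODULE)
++ *   toiActiveAllocator->rw_init(rw, which)
++ *   checksum, ui, ... ->rw_init(rw, which)    (other modules)
++ *
++ * Any failure aborts the hibernation or resume cycle.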
++ **/ ++static int rw_init_modules(int rw, int which) ++{ ++ struct toi_module_ops *this_module; ++ /* Initialise page transformers */ ++ list_for_each_entry(this_module, &toi_filters, type_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->rw_init && this_module->rw_init(rw, which)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialize the %s filter.", ++ this_module->name); ++ return 1; ++ } ++ } ++ ++ /* Initialise allocator */ ++ if (toiActiveAllocator->rw_init(rw, which)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialise the allocator."); ++ return 1; ++ } ++ ++ /* Initialise other modules */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type == FILTER_MODULE || ++ this_module->type == WRITER_MODULE) ++ continue; ++ if (this_module->rw_init && this_module->rw_init(rw, which)) { ++ set_abort_result(TOI_FAILED_MODULE_INIT); ++ printk(KERN_INFO "Setting aborted flag due to module " ++ "init failure.\n"); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * rw_cleanup_modules - cleanup modules ++ * @rw: Whether we are reading of writing an image. ++ * ++ * Cleanup components after reading or writing a set of pages. ++ * Only the allocator may fail. ++ **/ ++static int rw_cleanup_modules(int rw) ++{ ++ struct toi_module_ops *this_module; ++ int result = 0; ++ ++ /* Cleanup other modules */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type == FILTER_MODULE || ++ this_module->type == WRITER_MODULE) ++ continue; ++ if (this_module->rw_cleanup) ++ result |= this_module->rw_cleanup(rw); ++ } ++ ++ /* Flush data and cleanup */ ++ list_for_each_entry(this_module, &toi_filters, type_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->rw_cleanup) ++ result |= this_module->rw_cleanup(rw); ++ } ++ ++ result |= toiActiveAllocator->rw_cleanup(rw); ++ ++ return result; ++} ++ ++static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) ++{ ++ int index, min, max; ++ struct page *high_page = NULL, ++ **my_last_high_page = &__get_cpu_var(last_high_page), ++ **my_last_sought = &__get_cpu_var(last_sought); ++ struct pbe *this, **my_last_low_page = &__get_cpu_var(last_low_page); ++ void *compare; ++ ++ if (is_high) { ++ if (*my_last_sought && *my_last_high_page && ++ *my_last_sought < orig_page) ++ high_page = *my_last_high_page; ++ else ++ high_page = (struct page *) restore_highmem_pblist; ++ this = (struct pbe *) kmap(high_page); ++ compare = orig_page; ++ } else { ++ if (*my_last_sought && *my_last_low_page && ++ *my_last_sought < orig_page) ++ this = *my_last_low_page; ++ else ++ this = restore_pblist; ++ compare = page_address(orig_page); ++ } ++ ++ *my_last_sought = orig_page; ++ ++ /* Locate page containing pbe */ ++ while (this[PBES_PER_PAGE - 1].next && ++ this[PBES_PER_PAGE - 1].orig_address < compare) { ++ if (is_high) { ++ struct page *next_high_page = (struct page *) ++ this[PBES_PER_PAGE - 1].next; ++ kunmap(high_page); ++ this = kmap(next_high_page); ++ high_page = next_high_page; ++ } else ++ this = this[PBES_PER_PAGE - 1].next; ++ } ++ ++ /* Do a binary search within the page */ ++ min = 0; ++ max = PBES_PER_PAGE; ++ index = PBES_PER_PAGE / 2; ++ while (max - min) { ++ if (!this[index].orig_address || ++ this[index].orig_address > compare) ++ max = index; ++ else if (this[index].orig_address == compare) { ++ if (is_high) { ++ struct page *page = 
this[index].address; ++ *my_last_high_page = high_page; ++ kunmap(high_page); ++ return page; ++ } ++ *my_last_low_page = this; ++ return virt_to_page(this[index].address); ++ } else ++ min = index; ++ index = ((max + min) / 2); ++ }; ++ ++ if (is_high) ++ kunmap(high_page); ++ ++ abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" ++ " orig page %p. This[min].orig_address=%p.\n", orig_page, ++ this[index].orig_address); ++ return NULL; ++} ++ ++/** ++ * write_next_page - write the next page in a pageset ++ * @data_pfn: The pfn where the next data to write is located. ++ * @my_io_index: The index of the page in the pageset. ++ * @write_pfn: The pfn number to write in the image (where the data belongs). ++ * ++ * Get the pfn of the next page to write, map the page if necessary and do the ++ * write. ++ **/ ++static int write_next_page(unsigned long *data_pfn, int *my_io_index, ++ unsigned long *write_pfn) ++{ ++ struct page *page; ++ char **my_checksum_locn = &__get_cpu_var(checksum_locn); ++ int result = 0, was_present; ++ ++ *data_pfn = memory_bm_next_pfn(io_map); ++ ++ /* Another thread could have beaten us to it. */ ++ if (*data_pfn == BM_END_OF_MAP) { ++ if (atomic_read(&io_count)) { ++ printk(KERN_INFO "Ran out of pfns but io_count is " ++ "still %d.\n", atomic_read(&io_count)); ++ BUG(); ++ } ++ mutex_unlock(&io_mutex); ++ return -ENODATA; ++ } ++ ++ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); ++ ++ memory_bm_clear_bit(io_map, *data_pfn); ++ page = pfn_to_page(*data_pfn); ++ ++ was_present = kernel_page_present(page); ++ if (!was_present) ++ kernel_map_pages(page, 1, 1); ++ ++ if (io_pageset == 1) ++ *write_pfn = memory_bm_next_pfn(pageset1_map); ++ else { ++ *write_pfn = *data_pfn; ++ *my_checksum_locn = tuxonice_get_next_checksum(); ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Write %d:%ld.", *my_io_index, *write_pfn); ++ ++ mutex_unlock(&io_mutex); ++ ++ if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) ++ return 1; ++ ++ result = first_filter->write_page(*write_pfn, TOI_PAGE, page, ++ PAGE_SIZE); ++ ++ if (!was_present) ++ kernel_map_pages(page, 1, 0); ++ ++ return result; ++} ++ ++/** ++ * read_next_page - read the next page in a pageset ++ * @my_io_index: The index of the page in the pageset. ++ * @write_pfn: The pfn in which the data belongs. ++ * ++ * Read a page of the image into our buffer. It can happen (here and in the ++ * write routine) that threads don't get run until after other CPUs have done ++ * all the work. This was the cause of the long standing issue with ++ * occasionally getting -ENODATA errors at the end of reading the image. We ++ * therefore need to check there's actually a page to read before trying to ++ * retrieve one. ++ **/ ++ ++static int read_next_page(int *my_io_index, unsigned long *write_pfn, ++ struct page *buffer) ++{ ++ unsigned int buf_size = PAGE_SIZE; ++ unsigned long left = atomic_read(&io_count); ++ ++ if (!left) ++ return -ENODATA; ++ ++ /* Start off assuming the page we read isn't resaved */ ++ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); ++ ++ mutex_unlock(&io_mutex); ++ ++ /* ++ * Are we aborting? If so, don't submit any more I/O as ++ * resetting the resume_attempted flag (from ui.c) will ++ * clear the bdev flags, making this thread oops. ++ */ ++ if (unlikely(test_toi_state(TOI_STOP_RESUME))) { ++ atomic_dec(&toi_io_workers); ++ if (!atomic_read(&toi_io_workers)) { ++ /* ++ * So we can be sure we'll have memory for ++ * marking that we haven't resumed. 
++ */ ++ rw_cleanup_modules(READ); ++ set_toi_state(TOI_IO_STOPPED); ++ } ++ while (1) ++ schedule(); ++ } ++ ++ /* ++ * See toi_bio_read_page in tuxonice_bio.c: ++ * read the next page in the image. ++ */ ++ return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); ++} ++ ++static void use_read_page(unsigned long write_pfn, struct page *buffer) ++{ ++ struct page *final_page = pfn_to_page(write_pfn), ++ *copy_page = final_page; ++ char *virt, *buffer_virt; ++ int was_present, cpu = smp_processor_id(); ++ unsigned long idx = 0; ++ ++ if (io_pageset == 1 && (!pageset1_copy_map || ++ !memory_bm_test_bit_index(pageset1_copy_map, write_pfn, cpu))) { ++ int is_high = PageHighMem(final_page); ++ copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high); ++ } ++ ++ if (!memory_bm_test_bit_index(io_map, write_pfn, cpu)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld.", write_pfn); ++ mutex_lock(&io_mutex); ++ idx = atomic_add_return(1, &io_count); ++ mutex_unlock(&io_mutex); ++ return; ++ } ++ ++ virt = kmap(copy_page); ++ buffer_virt = kmap(buffer); ++ was_present = kernel_page_present(copy_page); ++ if (!was_present) ++ kernel_map_pages(copy_page, 1, 1); ++ memcpy(virt, buffer_virt, PAGE_SIZE); ++ if (!was_present) ++ kernel_map_pages(copy_page, 1, 0); ++ kunmap(copy_page); ++ kunmap(buffer); ++ memory_bm_clear_bit_index(io_map, write_pfn, cpu); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Read %d:%ld", idx, write_pfn); ++} ++ ++static unsigned long status_update(int writing, unsigned long done, ++ unsigned long ticks) ++{ ++ int cs_index = writing ? 0 : 1; ++ unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; ++ unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); ++ unsigned long pgs_per_s, estimate = 0, pages_left; ++ ++ if (msec) { ++ pages_left = io_barmax - done; ++ pgs_per_s = 1000 * done / msec; ++ if (pgs_per_s) ++ estimate = DIV_ROUND_UP(pages_left, pgs_per_s); ++ } ++ ++ if (estimate && ticks > HZ / 2) ++ return toi_update_status(done, io_barmax, ++ " %d/%d MB (%lu sec left)", ++ MB(done+1), MB(io_barmax), estimate); ++ ++ return toi_update_status(done, io_barmax, " %d/%d MB", ++ MB(done+1), MB(io_barmax)); ++} ++ ++/** ++ * worker_rw_loop - main loop to read/write pages ++ * ++ * The main I/O loop for reading or writing pages. The io_map bitmap is used to ++ * track the pages to read/write. ++ * If we are reading, the pages are loaded to their final (mapped) pfn. ++ * Data is non zero iff this is a thread started via start_other_threads. ++ * In that case, we stay in here until told to quit. 
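++ *
++ * A simplified sketch of the loop below (not additional code):
++ *
++ *   atomic_inc(&toi_io_workers);
++ *   while (pages left && !aborted &&
++ *          toi_worker_command == TOI_IO_WORKER_RUN)
++ *     io_write ? write_next_page(...) : read_next_page(...);
++ *   atomic_dec(&toi_io_workers);
++ *   if (started by toi_start_other_threads)
++ *     wait on toi_worker_wait_queue for the next RUN (loop again)
++ *     or EXIT;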
++ **/ ++static int worker_rw_loop(void *data) ++{ ++ unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, ++ jif_index = 1, start_time = jiffies, thread_num; ++ int result = 0, my_io_index = 0, last_worker; ++ struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); ++ cpumask_var_t orig_mask; ++ ++ if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { ++ printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); ++ return -ENOMEM; ++ } ++ ++ cpumask_copy(orig_mask, tsk_cpus_allowed(current)); ++ ++ current->flags |= PF_NOFREEZE; ++ ++top: ++ mutex_lock(&io_mutex); ++ thread_num = atomic_read(&toi_io_workers); ++ ++ cpumask_copy(tsk_cpus_allowed(current), orig_mask); ++ schedule(); ++ ++ atomic_inc(&toi_io_workers); ++ ++ while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && ++ !(io_write && test_result_state(TOI_ABORTED)) && ++ toi_worker_command == TOI_IO_WORKER_RUN) { ++ if (!thread_num && jiffies > next_jiffies) { ++ next_jiffies += HZ / 4; ++ if (toiActiveAllocator->update_throughput_throttle) ++ toiActiveAllocator->update_throughput_throttle( ++ jif_index); ++ jif_index++; ++ } ++ ++ /* ++ * What page to use? If reading, don't know yet which page's ++ * data will be read, so always use the buffer. If writing, ++ * use the copy (Pageset1) or original page (Pageset2), but ++ * always write the pfn of the original page. ++ */ ++ if (io_write) ++ result = write_next_page(&data_pfn, &my_io_index, ++ &write_pfn); ++ else /* Reading */ ++ result = read_next_page(&my_io_index, &write_pfn, ++ buffer); ++ ++ if (result) { ++ mutex_lock(&io_mutex); ++ /* Nothing to do? */ ++ if (result == -ENODATA) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Thread %d has no more work.", ++ smp_processor_id()); ++ break; ++ } ++ ++ io_result = result; ++ ++ if (io_write) { ++ printk(KERN_INFO "Write chunk returned %d.\n", ++ result); ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to write a chunk of the " ++ "image."); ++ break; ++ } ++ ++ if (io_pageset == 1) { ++ printk(KERN_ERR "\nBreaking out of I/O loop " ++ "because of result code %d.\n", result); ++ break; ++ } ++ panic("Read chunk returned (%d)", result); ++ } ++ ++ /* ++ * Discard reads of resaved pages while reading ps2 ++ * and unwanted pages while rereading ps2 when aborting. ++ */ ++ if (!io_write) { ++ if (!PageResave(pfn_to_page(write_pfn))) ++ use_read_page(write_pfn, buffer); ++ else { ++ mutex_lock(&io_mutex); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Resaved %ld.", write_pfn); ++ atomic_inc(&io_count); ++ mutex_unlock(&io_mutex); ++ } ++ } ++ ++ if (!thread_num) { ++ if(my_io_index + io_base > io_nextupdate) ++ io_nextupdate = status_update(io_write, ++ my_io_index + io_base, ++ jiffies - start_time); ++ ++ if (my_io_index > io_pc) { ++ printk(KERN_CONT "...%d%%", 20 * io_pc_step); ++ io_pc_step++; ++ io_pc = io_finish_at * io_pc_step / 5; ++ } ++ } ++ ++ toi_cond_pause(0, NULL); ++ ++ /* ++ * Subtle: If there's less I/O still to be done than threads ++ * running, quit. This stops us doing I/O beyond the end of ++ * the image when reading. ++ * ++ * Possible race condition. Two threads could do the test at ++ * the same time; one should exit and one should continue. ++ * Therefore we take the mutex before comparing and exiting. 
++ */ ++ ++ mutex_lock(&io_mutex); ++ } ++ ++ last_worker = atomic_dec_and_test(&toi_io_workers); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); ++ mutex_unlock(&io_mutex); ++ ++ if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { ++ /* Were we the last thread and we're using a flusher thread? */ ++ if (last_worker && using_flusher) { ++ toiActiveAllocator->finish_all_io(); ++ } ++ /* First, if we're doing I/O, wait for it to finish */ ++ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); ++ /* Then wait to be told what to do next */ ++ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); ++ if (toi_worker_command == TOI_IO_WORKER_RUN) ++ goto top; ++ } ++ ++ if (thread_num) ++ atomic_dec(&toi_num_other_threads); ++ ++ toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); ++ toi__free_page(28, buffer); ++ free_cpumask_var(orig_mask); ++ ++ return result; ++} ++ ++int toi_start_other_threads(void) ++{ ++ int cpu; ++ struct task_struct *p; ++ int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1; ++ unsigned long num_started = 0; ++ ++ if (test_action_state(TOI_NO_MULTITHREADED_IO)) ++ return 0; ++ ++ toi_worker_command = TOI_IO_WORKER_STOP; ++ ++ for_each_online_cpu(cpu) { ++ if (num_started == to_start) ++ break; ++ ++ if (cpu == smp_processor_id()) ++ continue; ++ ++ p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, ++ cpu_to_node(cpu), "ktoi_io/%d", cpu); ++ if (IS_ERR(p)) { ++ printk(KERN_ERR "ktoi_io for %i failed\n", cpu); ++ continue; ++ } ++ kthread_bind(p, cpu); ++ p->flags |= PF_MEMALLOC; ++ wake_up_process(p); ++ num_started++; ++ atomic_inc(&toi_num_other_threads); ++ } ++ ++ toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); ++ return num_started; ++} ++ ++void toi_stop_other_threads(void) ++{ ++ toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); ++ toi_worker_command = TOI_IO_WORKER_EXIT; ++ wake_up(&toi_worker_wait_queue); ++} ++ ++/** ++ * do_rw_loop - main highlevel function for reading or writing pages ++ * ++ * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. 
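++ *
++ * In outline (a condensed sketch of the body below):
++ *
++ *   io_write = write; io_finish_at = finish_at; ...   (per-run globals)
++ *   memory_bm_clear(io_map);
++ *   copy up to finish_at set bits from pageflags into io_map;
++ *   atomic_set(&io_count, finish_at);
++ *   toi_worker_command = TOI_IO_WORKER_RUN;
++ *   wake_up(&toi_worker_wait_queue);
++ *   using_flusher ? toiActiveAllocator->io_flusher(write)
++ *                 : worker_rw_loop(NULL);
++ *   wait for toi_io_workers to drop to zero;
++ *   toi_worker_command = TOI_IO_WORKER_STOP;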
++ **/ ++static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, ++ int base, int barmax, int pageset) ++{ ++ int index = 0, cpu, result = 0, workers_started; ++ unsigned long pfn; ++ ++ first_filter = toi_get_next_filter(NULL); ++ ++ if (!finish_at) ++ return 0; ++ ++ io_write = write; ++ io_finish_at = finish_at; ++ io_base = base; ++ io_barmax = barmax; ++ io_pageset = pageset; ++ io_index = 0; ++ io_pc = io_finish_at / 5; ++ io_pc_step = 1; ++ io_result = 0; ++ io_nextupdate = base + 1; ++ toi_bio_queue_flusher_should_finish = 0; ++ ++ for_each_online_cpu(cpu) { ++ per_cpu(last_sought, cpu) = NULL; ++ per_cpu(last_low_page, cpu) = NULL; ++ per_cpu(last_high_page, cpu) = NULL; ++ } ++ ++ /* Ensure all bits clear */ ++ memory_bm_clear(io_map); ++ ++ /* Set the bits for the pages to write */ ++ memory_bm_position_reset(pageflags); ++ ++ pfn = memory_bm_next_pfn(pageflags); ++ ++ while (pfn != BM_END_OF_MAP && index < finish_at) { ++ memory_bm_set_bit(io_map, pfn); ++ pfn = memory_bm_next_pfn(pageflags); ++ index++; ++ } ++ ++ BUG_ON(index < finish_at); ++ ++ atomic_set(&io_count, finish_at); ++ ++ memory_bm_position_reset(pageset1_map); ++ ++ mutex_lock(&io_mutex); ++ ++ clear_toi_state(TOI_IO_STOPPED); ++ ++ using_flusher = (atomic_read(&toi_num_other_threads) && ++ toiActiveAllocator->io_flusher && ++ !test_action_state(TOI_NO_FLUSHER_THREAD)); ++ ++ workers_started = atomic_read(&toi_num_other_threads); ++ ++ memory_bm_set_iterators(io_map, atomic_read(&toi_num_other_threads) + 1); ++ memory_bm_position_reset(io_map); ++ ++ memory_bm_set_iterators(pageset1_copy_map, atomic_read(&toi_num_other_threads) + 1); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ toi_worker_command = TOI_IO_WORKER_RUN; ++ wake_up(&toi_worker_wait_queue); ++ ++ mutex_unlock(&io_mutex); ++ ++ if (using_flusher) ++ result = toiActiveAllocator->io_flusher(write); ++ else ++ worker_rw_loop(NULL); ++ ++ while (atomic_read(&toi_io_workers)) ++ schedule(); ++ ++ printk(KERN_CONT "\n"); ++ ++ toi_worker_command = TOI_IO_WORKER_STOP; ++ wake_up(&toi_worker_wait_queue); ++ ++ if (unlikely(test_toi_state(TOI_STOP_RESUME))) { ++ if (!atomic_read(&toi_io_workers)) { ++ rw_cleanup_modules(READ); ++ set_toi_state(TOI_IO_STOPPED); ++ } ++ while (1) ++ schedule(); ++ } ++ set_toi_state(TOI_IO_STOPPED); ++ ++ if (!io_result && !result && !test_result_state(TOI_ABORTED)) { ++ unsigned long next; ++ ++ toi_update_status(io_base + io_finish_at, io_barmax, ++ " %d/%d MB ", ++ MB(io_base + io_finish_at), MB(io_barmax)); ++ ++ memory_bm_position_reset(io_map); ++ next = memory_bm_next_pfn(io_map); ++ if (next != BM_END_OF_MAP) { ++ printk(KERN_INFO "Finished I/O loop but still work to " ++ "do?\nFinish at = %d. io_count = %d.\n", ++ finish_at, atomic_read(&io_count)); ++ printk(KERN_INFO "I/O bitmap still records work to do." ++ "%ld.\n", next); ++ BUG(); ++ do { ++ cpu_relax(); ++ } while (0); ++ } ++ } ++ ++ return io_result ? io_result : result; ++} ++ ++/** ++ * write_pageset - write a pageset to disk. ++ * @pagedir: Which pagedir to write. ++ * ++ * Returns: ++ * Zero on success or -1 on failure. ++ **/ ++int write_pageset(struct pagedir *pagedir) ++{ ++ int finish_at, base = 0; ++ int barmax = pagedir1.size + pagedir2.size; ++ long error = 0; ++ struct memory_bitmap *pageflags; ++ unsigned long start_time, end_time; ++ ++ /* ++ * Even if there is nothing to read or write, the allocator ++ * may need the init/cleanup for it's housekeeping. 
(eg: ++ * Pageset1 may start where pageset2 ends when writing). ++ */ ++ finish_at = pagedir->size; ++ ++ if (pagedir->id == 1) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "Writing kernel & process data..."); ++ base = pagedir2.size; ++ if (test_action_state(TOI_TEST_FILTER_SPEED) || ++ test_action_state(TOI_TEST_BIO)) ++ pageflags = pageset1_map; ++ else ++ pageflags = pageset1_copy_map; ++ } else { ++ toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); ++ pageflags = pageset2_map; ++ } ++ ++ start_time = jiffies; ++ ++ if (rw_init_modules(1, pagedir->id)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialise modules for writing."); ++ error = 1; ++ } ++ ++ if (!error) ++ error = do_rw_loop(1, finish_at, pageflags, base, barmax, ++ pagedir->id); ++ ++ if (rw_cleanup_modules(WRITE) && !error) { ++ abort_hibernate(TOI_FAILED_MODULE_CLEANUP, ++ "Failed to cleanup after writing."); ++ error = 1; ++ } ++ ++ end_time = jiffies; ++ ++ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { ++ toi_bkd.toi_io_time[0][0] += finish_at, ++ toi_bkd.toi_io_time[0][1] += (end_time - start_time); ++ } ++ ++ return error; ++} ++ ++/** ++ * read_pageset - highlevel function to read a pageset from disk ++ * @pagedir: pageset to read ++ * @overwrittenpagesonly: Whether to read the whole pageset or ++ * only part of it. ++ * ++ * Returns: ++ * Zero on success or -1 on failure. ++ **/ ++static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) ++{ ++ int result = 0, base = 0; ++ int finish_at = pagedir->size; ++ int barmax = pagedir1.size + pagedir2.size; ++ struct memory_bitmap *pageflags; ++ unsigned long start_time, end_time; ++ ++ if (pagedir->id == 1) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "Reading kernel & process data..."); ++ pageflags = pageset1_map; ++ } else { ++ toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); ++ if (overwrittenpagesonly) { ++ barmax = min(pagedir1.size, pagedir2.size); ++ finish_at = min(pagedir1.size, pagedir2.size); ++ } else ++ base = pagedir1.size; ++ pageflags = pageset2_map; ++ } ++ ++ start_time = jiffies; ++ ++ if (rw_init_modules(0, pagedir->id)) { ++ toiActiveAllocator->remove_image(); ++ result = 1; ++ } else ++ result = do_rw_loop(0, finish_at, pageflags, base, barmax, ++ pagedir->id); ++ ++ if (rw_cleanup_modules(READ) && !result) { ++ abort_hibernate(TOI_FAILED_MODULE_CLEANUP, ++ "Failed to cleanup after reading."); ++ result = 1; ++ } ++ ++ /* Statistics */ ++ end_time = jiffies; ++ ++ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { ++ toi_bkd.toi_io_time[1][0] += finish_at, ++ toi_bkd.toi_io_time[1][1] += (end_time - start_time); ++ } ++ ++ return result; ++} ++ ++/** ++ * write_module_configs - store the modules configuration ++ * ++ * The configuration for each module is stored in the image header. ++ * Returns: Int ++ * Zero on success, Error value otherwise. ++ **/ ++static int write_module_configs(void) ++{ ++ struct toi_module_ops *this_module; ++ char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); ++ int len, index = 1; ++ struct toi_module_header toi_module_header; ++ ++ if (!buffer) { ++ printk(KERN_INFO "Failed to allocate a buffer for saving " ++ "module configuration info.\n"); ++ return -ENOMEM; ++ } ++ ++ /* ++ * We have to know which data goes with which module, so we at ++ * least write a length of zero for a module. Note that we are ++ * also assuming every module's config data takes <= PAGE_SIZE. 
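++ *
++ * The resulting header stream therefore looks like, for each enabled
++ * module in registration order:
++ *
++ *   struct toi_module_header   (enabled, type, index, name)
++ *   int len                    (0 if the module has no config data)
++ *   char[len]                  the module's config data, if any
++ *
++ * and is terminated by a toi_module_header with name[0] == '\0', which
++ * is the condition read_module_configs() loops on when reloading.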
++ */ ++ ++ /* For each module (in registration order) */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || !this_module->storage_needed || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ ++ /* Get the data from the module */ ++ len = 0; ++ if (this_module->save_config_info) ++ len = this_module->save_config_info(buffer); ++ ++ /* Save the details of the module */ ++ toi_module_header.enabled = this_module->enabled; ++ toi_module_header.type = this_module->type; ++ toi_module_header.index = index++; ++ strncpy(toi_module_header.name, this_module->name, ++ sizeof(toi_module_header.name)); ++ toiActiveAllocator->rw_header_chunk(WRITE, ++ this_module, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ ++ /* Save the size of the data and any data returned */ ++ toiActiveAllocator->rw_header_chunk(WRITE, ++ this_module, ++ (char *) &len, sizeof(int)); ++ if (len) ++ toiActiveAllocator->rw_header_chunk( ++ WRITE, this_module, buffer, len); ++ } ++ ++ /* Write a blank header to terminate the list */ ++ toi_module_header.name[0] = '\0'; ++ toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &toi_module_header, sizeof(toi_module_header)); ++ ++ toi_free_page(22, (unsigned long) buffer); ++ return 0; ++} ++ ++/** ++ * read_one_module_config - read and configure one module ++ * ++ * Read the configuration for one module, and configure the module ++ * to match if it is loaded. ++ * ++ * Returns: Int ++ * Zero on success, Error value otherwise. ++ **/ ++static int read_one_module_config(struct toi_module_header *header) ++{ ++ struct toi_module_ops *this_module; ++ int result, len; ++ char *buffer; ++ ++ /* Find the module */ ++ this_module = toi_find_module_given_name(header->name); ++ ++ if (!this_module) { ++ if (header->enabled) { ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "It looks like we need module %s for reading " ++ "the image but it hasn't been registered.\n", ++ header->name); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) ++ return -EINVAL; ++ } else ++ printk(KERN_INFO "Module %s configuration data found, " ++ "but the module hasn't registered. Looks like " ++ "it was disabled, so we're ignoring its data.", ++ header->name); ++ } ++ ++ /* Get the length of the data (if any) */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, ++ sizeof(int)); ++ if (result) { ++ printk(KERN_ERR "Failed to read the length of the module %s's" ++ " configuration data.\n", ++ header->name); ++ return -EINVAL; ++ } ++ ++ /* Read any data and pass to the module (if we found one) */ ++ if (!len) ++ return 0; ++ ++ buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); ++ ++ if (!buffer) { ++ printk(KERN_ERR "Failed to allocate a buffer for reloading " ++ "module configuration info.\n"); ++ return -ENOMEM; ++ } ++ ++ toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); ++ ++ if (!this_module) ++ goto out; ++ ++ if (!this_module->save_config_info) ++ printk(KERN_ERR "Huh? Module %s appears to have a " ++ "save_config_info, but not a load_config_info " ++ "function!\n", this_module->name); ++ else ++ this_module->load_config_info(buffer, len); ++ ++ /* ++ * Now move this module to the tail of its lists. This will put it in ++ * order. Any new modules will end up at the top of the lists. They ++ * should have been set to disabled when loaded (people will ++ * normally not edit an initrd to load a new module and then hibernate ++ * without using it!). 
++ */ ++ ++ toi_move_module_tail(this_module); ++ ++ this_module->enabled = header->enabled; ++ ++out: ++ toi_free_page(23, (unsigned long) buffer); ++ return 0; ++} ++ ++/** ++ * read_module_configs - reload module configurations from the image header. ++ * ++ * Returns: Int ++ * Zero on success or an error code. ++ **/ ++static int read_module_configs(void) ++{ ++ int result = 0; ++ struct toi_module_header toi_module_header; ++ struct toi_module_ops *this_module; ++ ++ /* All modules are initially disabled. That way, if we have a module ++ * loaded now that wasn't loaded when we hibernated, it won't be used ++ * in trying to read the data. ++ */ ++ list_for_each_entry(this_module, &toi_modules, module_list) ++ this_module->enabled = 0; ++ ++ /* Get the first module header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ if (result) { ++ printk(KERN_ERR "Failed to read the next module header.\n"); ++ return -EINVAL; ++ } ++ ++ /* For each module (in registration order) */ ++ while (toi_module_header.name[0]) { ++ result = read_one_module_config(&toi_module_header); ++ ++ if (result) ++ return -EINVAL; ++ ++ /* Get the next module header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ ++ if (result) { ++ printk(KERN_ERR "Failed to read the next module " ++ "header.\n"); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) ++{ ++ return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1; ++} ++ ++int fs_info_space_needed(void) ++{ ++ const struct super_block *sb; ++ int result = sizeof(int); ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) ++ result += 16 + sizeof(dev_t) + sizeof(int) + ++ fs->last_mount_size; ++ free_fs_info(fs); ++ } ++ return result; ++} ++ ++static int fs_info_num_to_save(void) ++{ ++ const struct super_block *sb; ++ int to_save = 0; ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) ++ to_save++; ++ free_fs_info(fs); ++ } ++ ++ return to_save; ++} ++ ++static int fs_info_save(void) ++{ ++ const struct super_block *sb; ++ int to_save = fs_info_num_to_save(); ++ ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" ++ " to save."); ++ return -EIO; ++ } ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) { ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ &fs->uuid[0], 16)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write uuid."); ++ return -EIO; ++ } ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &fs->dev_t, sizeof(dev_t))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write dev_t."); ++ return -EIO; ++ } ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &fs->last_mount_size, sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write last mount length."); ++ return -EIO; ++ } ++ if 
(toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ fs->last_mount, fs->last_mount_size)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write uuid."); ++ return -EIO; ++ } ++ } ++ free_fs_info(fs); ++ } ++ return 0; ++} ++ ++static int fs_info_load_and_check_one(void) ++{ ++ char uuid[16], *last_mount; ++ int result = 0, ln; ++ dev_t dev_t; ++ struct block_device *dev; ++ struct fs_info *fs_info, seek; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); ++ return -EIO; ++ } ++ ++ read_if_version(3, dev_t, "uuid dev_t field", return -EIO); ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to read last mount size."); ++ return -EIO; ++ } ++ ++ last_mount = kzalloc(ln, GFP_KERNEL); ++ ++ if (!last_mount) ++ return -ENOMEM; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to read last mount timestamp."); ++ result = -EIO; ++ goto out_lmt; ++ } ++ ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = dev_t; ++ seek.last_mount_size = ln; ++ seek.last_mount = last_mount; ++ dev_t = blk_lookup_fs_info(&seek); ++ if (!dev_t) ++ goto out_lmt; ++ ++ dev = toi_open_by_devnum(dev_t); ++ ++ fs_info = fs_info_from_block_dev(dev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ if (ln != fs_info->last_mount_size) { ++ printk(KERN_EMERG "Found matching uuid but last mount " ++ "time lengths differ?! " ++ "(%d vs %d).\n", ln, ++ fs_info->last_mount_size); ++ result = -EINVAL; ++ } else { ++ char buf[BDEVNAME_SIZE]; ++ result = !!memcmp(fs_info->last_mount, last_mount, ln); ++ if (result) ++ printk(KERN_EMERG "Last mount time for %s has " ++ "changed!\n", bdevname(dev, buf)); ++ } ++ } ++ toi_close_bdev(dev); ++ free_fs_info(fs_info); ++out_lmt: ++ kfree(last_mount); ++ return result; ++} ++ ++static int fs_info_load_and_check(void) ++{ ++ int to_do, result = 0; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " ++ "to load."); ++ return -EIO; ++ } ++ ++ while(to_do--) ++ result |= fs_info_load_and_check_one(); ++ ++ return result; ++} ++ ++/** ++ * write_image_header - write the image header after write the image proper ++ * ++ * Returns: Int ++ * Zero on success, error value otherwise. 
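++ *
++ * After write_header_init(), the header chunks are written in this
++ * order (mirrored by __read_pageset1() at resume time):
++ *
++ *   struct toi_header        filled by fill_toi_header()
++ *   toi_max_workers          read back only if header version >= 1
++ *   filesystem info          fs_info_save()
++ *   module configurations    write_module_configs()
++ *   pageset1 bitmap          memory_bm_write(pageset1_map, ...)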
++ **/
++int write_image_header(void)
++{
++ int ret;
++ int total = pagedir1.size + pagedir2.size+2;
++ char *header_buffer = NULL;
++
++ /* Now prepare to write the header */
++ ret = toiActiveAllocator->write_header_init();
++ if (ret) {
++ abort_hibernate(TOI_FAILED_MODULE_INIT,
++ "Active allocator's write_header_init"
++ " function failed.");
++ goto write_image_header_abort;
++ }
++
++ /* Get a buffer */
++ header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
++ if (!header_buffer) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Out of memory when trying to get page for header!");
++ goto write_image_header_abort;
++ }
++
++ /* Write hibernate header */
++ if (fill_toi_header((struct toi_header *) header_buffer)) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to fill header information!");
++ goto write_image_header_abort;
++ }
++
++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
++ header_buffer, sizeof(struct toi_header))) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to write header info.");
++ goto write_image_header_abort;
++ }
++
++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
++ (char *) &toi_max_workers, sizeof(toi_max_workers))) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to write the number of workers to use.");
++ goto write_image_header_abort;
++ }
++
++ /* Write filesystem info */
++ if (fs_info_save())
++ goto write_image_header_abort;
++
++ /* Write module configurations */
++ ret = write_module_configs();
++ if (ret) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to write module configs.");
++ goto write_image_header_abort;
++ }
++
++ if (memory_bm_write(pageset1_map,
++ toiActiveAllocator->rw_header_chunk)) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to write bitmaps.");
++ goto write_image_header_abort;
++ }
++
++ /* Flush data and let allocator cleanup */
++ if (toiActiveAllocator->write_header_cleanup()) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to cleanup writing header.");
++ goto write_image_header_abort_no_cleanup;
++ }
++
++ if (test_result_state(TOI_ABORTED))
++ goto write_image_header_abort_no_cleanup;
++
++ toi_update_status(total, total, NULL);
++
++out:
++ if (header_buffer)
++ toi_free_page(24, (unsigned long) header_buffer);
++ return ret;
++
++write_image_header_abort:
++ toiActiveAllocator->write_header_cleanup();
++write_image_header_abort_no_cleanup:
++ ret = -1;
++ goto out;
++}
++
++/**
++ * sanity_check - check the header
++ * @sh: the header which was saved at hibernate time.
++ *
++ * Perform a few checks, seeking to ensure that the kernel being
++ * booted matches the one hibernated. They need to match so we can
++ * be _sure_ things will work. It is not absolutely impossible for
++ * resuming from a different kernel to work, just not assured.
++ **/ ++static char *sanity_check(struct toi_header *sh) ++{ ++ char *reason = check_image_kernel((struct swsusp_info *) sh); ++ ++ if (reason) ++ return reason; ++ ++ if (!test_action_state(TOI_IGNORE_ROOTFS)) { ++ const struct super_block *sb; ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ if ((!(sb->s_flags & MS_RDONLY)) && ++ (sb->s_type->fs_flags & FS_REQUIRES_DEV)) ++ return "Device backed fs has been mounted " ++ "rw prior to resume or initrd/ramfs " ++ "is mounted rw."; ++ } ++ } ++ ++ return NULL; ++} ++ ++static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); ++ ++#define FREEZE_IN_PROGRESS (~0) ++ ++static int freeze_result; ++ ++static void do_freeze(struct work_struct *dummy) ++{ ++ freeze_result = freeze_processes(); ++ wake_up(&freeze_wait); ++ trap_non_toi_io = 1; ++} ++ ++static DECLARE_WORK(freeze_work, do_freeze); ++ ++/** ++ * __read_pageset1 - test for the existence of an image and attempt to load it ++ * ++ * Returns: Int ++ * Zero if image found and pageset1 successfully loaded. ++ * Error if no image found or loaded. ++ **/ ++static int __read_pageset1(void) ++{ ++ int i, result = 0; ++ char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), ++ *sanity_error = NULL; ++ struct toi_header *toi_header; ++ ++ if (!header_buffer) { ++ printk(KERN_INFO "Unable to allocate a page for reading the " ++ "signature.\n"); ++ return -ENOMEM; ++ } ++ ++ /* Check for an image */ ++ result = toiActiveAllocator->image_exists(1); ++ if (result == 3) { ++ result = -ENODATA; ++ toi_early_boot_message(1, 0, "The signature from an older " ++ "version of TuxOnIce has been detected."); ++ goto out_remove_image; ++ } ++ ++ if (result != 1) { ++ result = -ENODATA; ++ noresume_reset_modules(); ++ printk(KERN_INFO "TuxOnIce: No image found.\n"); ++ goto out; ++ } ++ ++ /* ++ * Prepare the active allocator for reading the image header. The ++ * activate allocator might read its own configuration. ++ * ++ * NB: This call may never return because there might be a signature ++ * for a different image such that we warn the user and they choose ++ * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the ++ * location of the image might be unavailable if it was stored on a ++ * network connection). ++ */ ++ ++ result = toiActiveAllocator->read_header_init(); ++ if (result) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " ++ "image header.\n"); ++ goto out_remove_image; ++ } ++ ++ /* Check for noresume command line option */ ++ if (test_toi_state(TOI_NORESUME_SPECIFIED)) { ++ printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " ++ "image.\n"); ++ goto out_remove_image; ++ } ++ ++ /* Check whether we've resumed before */ ++ if (test_toi_state(TOI_RESUMED_BEFORE)) { ++ toi_early_boot_message(1, 0, NULL); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) { ++ printk(KERN_INFO "TuxOnIce: Tried to resume before: " ++ "Invalidated image.\n"); ++ goto out_remove_image; ++ } ++ } ++ ++ clear_toi_state(TOI_CONTINUE_REQ); ++ ++ toi_image_header_version = toiActiveAllocator->get_header_version(); ++ ++ if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { ++ toi_early_boot_message(1, 0, image_version_error); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) { ++ printk(KERN_INFO "TuxOnIce: Header version too new: " ++ "Invalidated image.\n"); ++ goto out_remove_image; ++ } ++ } ++ ++ /* Read hibernate header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ header_buffer, sizeof(struct toi_header)); ++ if (result < 0) { ++ printk(KERN_ERR "TuxOnIce: Failed to read the image " ++ "signature.\n"); ++ goto out_remove_image; ++ } ++ ++ toi_header = (struct toi_header *) header_buffer; ++ ++ /* ++ * NB: This call may also result in a reboot rather than returning. ++ */ ++ ++ sanity_error = sanity_check(toi_header); ++ if (sanity_error) { ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ sanity_error); ++ printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); ++ goto out_remove_image; ++ } ++ ++ /* ++ * We have an image and it looks like it will load okay. ++ * ++ * Get metadata from header. Don't override commandline parameters. ++ * ++ * We don't need to save the image size limit because it's not used ++ * during resume and will be restored with the image anyway. ++ */ ++ ++ memcpy((char *) &pagedir1, ++ (char *) &toi_header->pagedir, sizeof(pagedir1)); ++ toi_result = toi_header->param0; ++ if (!toi_bkd.toi_debug_state) { ++ toi_bkd.toi_action = ++ (toi_header->param1 & ~toi_bootflags_mask) | ++ (toi_bkd.toi_action & toi_bootflags_mask); ++ toi_bkd.toi_debug_state = toi_header->param2; ++ toi_bkd.toi_default_console_level = toi_header->param3; ++ } ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ pagedir2.size = toi_header->pageset_2_size; ++ for (i = 0; i < 4; i++) ++ toi_bkd.toi_io_time[i/2][i%2] = ++ toi_header->io_time[i/2][i%2]; ++ ++ set_toi_state(TOI_BOOT_KERNEL); ++ boot_kernel_data_buffer = toi_header->bkd; ++ ++ read_if_version(1, toi_max_workers, "TuxOnIce max workers", ++ goto out_remove_image); ++ ++ /* Read filesystem info */ ++ if (fs_info_load_and_check()) { ++ printk(KERN_EMERG "TuxOnIce: File system mount time checks " ++ "failed. 
Refusing to corrupt your filesystems!\n"); ++ goto out_remove_image; ++ } ++ ++ /* Read module configurations */ ++ result = read_module_configs(); ++ if (result) { ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " ++ "configurations.\n"); ++ clear_action_state(TOI_KEEP_IMAGE); ++ goto out_remove_image; ++ } ++ ++ toi_prepare_console(); ++ ++ set_toi_state(TOI_NOW_RESUMING); ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Disable nonboot cpus."); ++ if (disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ goto out_reset_console; ++ } ++ } ++ ++ result = pm_notifier_call_chain(PM_RESTORE_PREPARE); ++ if (result) ++ goto out_notifier_call_chain;; ++ ++ if (usermodehelper_disable()) ++ goto out_enable_nonboot_cpus; ++ ++ current->flags |= PF_NOFREEZE; ++ freeze_result = FREEZE_IN_PROGRESS; ++ ++ schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); ++ ++ toi_cond_pause(1, "About to read original pageset1 locations."); ++ ++ /* ++ * See _toi_rw_header_chunk in tuxonice_bio.c: ++ * Initialize pageset1_map by reading the map from the image. ++ */ ++ if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) ++ goto out_thaw; ++ ++ /* ++ * See toi_rw_cleanup in tuxonice_bio.c: ++ * Clean up after reading the header. ++ */ ++ result = toiActiveAllocator->read_header_cleanup(); ++ if (result) { ++ printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " ++ "image header.\n"); ++ goto out_thaw; ++ } ++ ++ toi_cond_pause(1, "About to read pagedir."); ++ ++ /* ++ * Get the addresses of pages into which we will load the kernel to ++ * be copied back and check if they conflict with the ones we are using. ++ */ ++ if (toi_get_pageset1_load_addresses()) { ++ printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " ++ "pageset1.\n"); ++ goto out_thaw; ++ } ++ ++ /* Read the original kernel back */ ++ toi_cond_pause(1, "About to read pageset 1."); ++ ++ /* Given the pagemap, read back the data from disk */ ++ if (read_pageset(&pagedir1, 0)) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); ++ result = -EIO; ++ goto out_thaw; ++ } ++ ++ toi_cond_pause(1, "About to restore original kernel."); ++ result = 0; ++ ++ if (!test_action_state(TOI_KEEP_IMAGE) && ++ toiActiveAllocator->mark_resume_attempted) ++ toiActiveAllocator->mark_resume_attempted(1); ++ ++ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); ++out: ++ current->flags &= ~PF_NOFREEZE; ++ toi_free_page(25, (unsigned long) header_buffer); ++ return result; ++ ++out_thaw: ++ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); ++ trap_non_toi_io = 0; ++ thaw_processes(); ++ usermodehelper_enable(); ++out_enable_nonboot_cpus: ++ enable_nonboot_cpus(); ++out_notifier_call_chain: ++ pm_notifier_call_chain(PM_POST_RESTORE); ++out_reset_console: ++ toi_cleanup_console(); ++out_remove_image: ++ result = -EINVAL; ++ if (!test_action_state(TOI_KEEP_IMAGE)) ++ toiActiveAllocator->remove_image(); ++ toiActiveAllocator->read_header_cleanup(); ++ noresume_reset_modules(); ++ goto out; ++} ++ ++/** ++ * read_pageset1 - highlevel function to read the saved pages ++ * ++ * Attempt to read the header and pageset1 of a hibernate image. ++ * Handle the outcome, complaining where appropriate. 
++ **/ ++int read_pageset1(void) ++{ ++ int error; ++ ++ error = __read_pageset1(); ++ ++ if (error && error != -ENODATA && error != -EINVAL && ++ !test_result_state(TOI_ABORTED)) ++ abort_hibernate(TOI_IMAGE_ERROR, ++ "TuxOnIce: Error %d resuming\n", error); ++ ++ return error; ++} ++ ++/** ++ * get_have_image_data - check the image header ++ **/ ++static char *get_have_image_data(void) ++{ ++ char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); ++ struct toi_header *toi_header; ++ ++ if (!output_buffer) { ++ printk(KERN_INFO "Output buffer null.\n"); ++ return NULL; ++ } ++ ++ /* Check for an image */ ++ if (!toiActiveAllocator->image_exists(1) || ++ toiActiveAllocator->read_header_init() || ++ toiActiveAllocator->rw_header_chunk(READ, NULL, ++ output_buffer, sizeof(struct toi_header))) { ++ sprintf(output_buffer, "0\n"); ++ /* ++ * From an initrd/ramfs, catting have_image and ++ * getting a result of 0 is sufficient. ++ */ ++ clear_toi_state(TOI_BOOT_TIME); ++ goto out; ++ } ++ ++ toi_header = (struct toi_header *) output_buffer; ++ ++ sprintf(output_buffer, "1\n%s\n%s\n", ++ toi_header->uts.machine, ++ toi_header->uts.version); ++ ++ /* Check whether we've resumed before */ ++ if (test_toi_state(TOI_RESUMED_BEFORE)) ++ strcat(output_buffer, "Resumed before.\n"); ++ ++out: ++ noresume_reset_modules(); ++ return output_buffer; ++} ++ ++/** ++ * read_pageset2 - read second part of the image ++ * @overwrittenpagesonly: Read only pages which would have been ++ * verwritten by pageset1? ++ * ++ * Read in part or all of pageset2 of an image, depending upon ++ * whether we are hibernating and have only overwritten a portion ++ * with pageset1 pages, or are resuming and need to read them ++ * all. ++ * ++ * Returns: Int ++ * Zero if no error, otherwise the error value. ++ **/ ++int read_pageset2(int overwrittenpagesonly) ++{ ++ int result = 0; ++ ++ if (!pagedir2.size) ++ return 0; ++ ++ result = read_pageset(&pagedir2, overwrittenpagesonly); ++ ++ toi_cond_pause(1, "Pagedir 2 read."); ++ ++ return result; ++} ++ ++/** ++ * image_exists_read - has an image been found? ++ * @page: Output buffer ++ * ++ * Store 0 or 1 in page, depending on whether an image is found. ++ * Incoming buffer is PAGE_SIZE and result is guaranteed ++ * to be far less than that, so we don't worry about ++ * overflow. 
++ **/ ++int image_exists_read(const char *page, int count) ++{ ++ int len = 0; ++ char *result; ++ ++ if (toi_activate_storage(0)) ++ return count; ++ ++ if (!test_toi_state(TOI_RESUME_DEVICE_OK)) ++ toi_attempt_to_parse_resume_device(0); ++ ++ if (!toiActiveAllocator) { ++ len = sprintf((char *) page, "-1\n"); ++ } else { ++ result = get_have_image_data(); ++ if (result) { ++ len = sprintf((char *) page, "%s", result); ++ toi_free_page(26, (unsigned long) result); ++ } ++ } ++ ++ toi_deactivate_storage(0); ++ ++ return len; ++} ++ ++/** ++ * image_exists_write - invalidate an image if one exists ++ **/ ++int image_exists_write(const char *buffer, int count) ++{ ++ if (toi_activate_storage(0)) ++ return count; ++ ++ if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) ++ toiActiveAllocator->remove_image(); ++ ++ toi_deactivate_storage(0); ++ ++ clear_result_state(TOI_KEPT_IMAGE); ++ ++ return count; ++} +diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h +new file mode 100644 +index 0000000..fe37713 +--- /dev/null ++++ b/kernel/power/tuxonice_io.h +@@ -0,0 +1,74 @@ ++/* ++ * kernel/power/tuxonice_io.h ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains high level IO routines for hibernating. ++ * ++ */ ++ ++#include ++#include "tuxonice_pagedir.h" ++ ++/* Non-module data saved in our image header */ ++struct toi_header { ++ /* ++ * Mirror struct swsusp_info, but without ++ * the page aligned attribute ++ */ ++ struct new_utsname uts; ++ u32 version_code; ++ unsigned long num_physpages; ++ int cpus; ++ unsigned long image_pages; ++ unsigned long pages; ++ unsigned long size; ++ ++ /* Our own data */ ++ unsigned long orig_mem_free; ++ int page_size; ++ int pageset_2_size; ++ int param0; ++ int param1; ++ int param2; ++ int param3; ++ int progress0; ++ int progress1; ++ int progress2; ++ int progress3; ++ int io_time[2][2]; ++ struct pagedir pagedir; ++ dev_t root_fs; ++ unsigned long bkd; /* Boot kernel data locn */ ++}; ++ ++extern int write_pageset(struct pagedir *pagedir); ++extern int write_image_header(void); ++extern int read_pageset1(void); ++extern int read_pageset2(int overwrittenpagesonly); ++ ++extern int toi_attempt_to_parse_resume_device(int quiet); ++extern void attempt_to_parse_resume_device2(void); ++extern void attempt_to_parse_alt_resume_param(void); ++int image_exists_read(const char *page, int count); ++int image_exists_write(const char *buffer, int count); ++extern void save_restore_alt_param(int replace, int quiet); ++extern atomic_t toi_io_workers; ++ ++/* Args to save_restore_alt_param */ ++#define RESTORE 0 ++#define SAVE 1 ++ ++#define NOQUIET 0 ++#define QUIET 1 ++ ++extern dev_t name_to_dev_t(char *line); ++ ++extern wait_queue_head_t toi_io_queue_flusher; ++extern int toi_bio_queue_flusher_should_finish; ++ ++int fs_info_space_needed(void); ++ ++extern int toi_max_workers; +diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c +new file mode 100644 +index 0000000..4cc24a9 +--- /dev/null ++++ b/kernel/power/tuxonice_modules.c +@@ -0,0 +1,522 @@ ++/* ++ * kernel/power/tuxonice_modules.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ */ ++ ++#include ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_ui.h" ++ ++LIST_HEAD(toi_filters); ++LIST_HEAD(toiAllocators); ++ ++LIST_HEAD(toi_modules); ++EXPORT_SYMBOL_GPL(toi_modules); ++ 
++struct toi_module_ops *toiActiveAllocator; ++EXPORT_SYMBOL_GPL(toiActiveAllocator); ++ ++static int toi_num_filters; ++int toiNumAllocators, toi_num_modules; ++ ++/* ++ * toi_header_storage_for_modules ++ * ++ * Returns the amount of space needed to store configuration ++ * data needed by the modules prior to copying back the original ++ * kernel. We can exclude data for pageset2 because it will be ++ * available anyway once the kernel is copied back. ++ */ ++long toi_header_storage_for_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int bytes = 0; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ if (this_module->storage_needed) { ++ int this = this_module->storage_needed() + ++ sizeof(struct toi_module_header) + ++ sizeof(int); ++ this_module->header_requested = this; ++ bytes += this; ++ } ++ } ++ ++ /* One more for the empty terminator */ ++ return bytes + sizeof(struct toi_module_header); ++} ++ ++void print_toi_header_storage_for_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int bytes = 0; ++ ++ printk(KERN_DEBUG "Header storage:\n"); ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ if (this_module->storage_needed) { ++ int this = this_module->storage_needed() + ++ sizeof(struct toi_module_header) + ++ sizeof(int); ++ this_module->header_requested = this; ++ bytes += this; ++ printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", ++ this_module->name, ++ this_module->header_used, this); ++ } ++ } ++ ++ printk(KERN_DEBUG "+ empty terminator : %zu.\n", ++ sizeof(struct toi_module_header)); ++ printk(KERN_DEBUG " ====\n"); ++ printk(KERN_DEBUG " %zu\n", ++ bytes + sizeof(struct toi_module_header)); ++} ++EXPORT_SYMBOL_GPL(print_toi_header_storage_for_modules); ++ ++/* ++ * toi_memory_for_modules ++ * ++ * Returns the amount of memory requested by modules for ++ * doing their work during the cycle. ++ */ ++ ++long toi_memory_for_modules(int print_parts) ++{ ++ long bytes = 0, result; ++ struct toi_module_ops *this_module; ++ ++ if (print_parts) ++ printk(KERN_INFO "Memory for modules:\n===================\n"); ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ int this; ++ if (!this_module->enabled) ++ continue; ++ if (this_module->memory_needed) { ++ this = this_module->memory_needed(); ++ if (print_parts) ++ printk(KERN_INFO "%10d bytes (%5ld pages) for " ++ "module '%s'.\n", this, ++ DIV_ROUND_UP(this, PAGE_SIZE), ++ this_module->name); ++ bytes += this; ++ } ++ } ++ ++ result = DIV_ROUND_UP(bytes, PAGE_SIZE); ++ if (print_parts) ++ printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); ++ ++ return result; ++} ++ ++/* ++ * toi_expected_compression_ratio ++ * ++ * Returns the compression ratio expected when saving the image. 
++ */ ++ ++int toi_expected_compression_ratio(void) ++{ ++ int ratio = 100; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->expected_compression) ++ ratio = ratio * this_module->expected_compression() ++ / 100; ++ } ++ ++ return ratio; ++} ++ ++/* toi_find_module_given_dir ++ * Functionality : Return a module (if found), given a pointer ++ * to its directory name ++ */ ++ ++static struct toi_module_ops *toi_find_module_given_dir(char *name) ++{ ++ struct toi_module_ops *this_module, *found_module = NULL; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!strcmp(name, this_module->directory)) { ++ found_module = this_module; ++ break; ++ } ++ } ++ ++ return found_module; ++} ++ ++/* toi_find_module_given_name ++ * Functionality : Return a module (if found), given a pointer ++ * to its name ++ */ ++ ++struct toi_module_ops *toi_find_module_given_name(char *name) ++{ ++ struct toi_module_ops *this_module, *found_module = NULL; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!strcmp(name, this_module->name)) { ++ found_module = this_module; ++ break; ++ } ++ } ++ ++ return found_module; ++} ++ ++/* ++ * toi_print_module_debug_info ++ * Functionality : Get debugging info from modules into a buffer. ++ */ ++int toi_print_module_debug_info(char *buffer, int buffer_size) ++{ ++ struct toi_module_ops *this_module; ++ int len = 0; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->print_debug_info) { ++ int result; ++ result = this_module->print_debug_info(buffer + len, ++ buffer_size - len); ++ len += result; ++ } ++ } ++ ++ /* Ensure null terminated */ ++ buffer[buffer_size] = 0; ++ ++ return len; ++} ++ ++/* ++ * toi_register_module ++ * ++ * Register a module. ++ */ ++int toi_register_module(struct toi_module_ops *module) ++{ ++ int i; ++ struct kobject *kobj; ++ ++ module->enabled = 1; ++ ++ if (toi_find_module_given_name(module->name)) { ++ printk(KERN_INFO "TuxOnIce: Trying to load module %s," ++ " which is already registered.\n", ++ module->name); ++ return -EBUSY; ++ } ++ ++ switch (module->type) { ++ case FILTER_MODULE: ++ list_add_tail(&module->type_list, &toi_filters); ++ toi_num_filters++; ++ break; ++ case WRITER_MODULE: ++ list_add_tail(&module->type_list, &toiAllocators); ++ toiNumAllocators++; ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return -EINVAL; ++ } ++ list_add_tail(&module->module_list, &toi_modules); ++ toi_num_modules++; ++ ++ if ((!module->directory && !module->shared_directory) || ++ !module->sysfs_data || !module->num_sysfs_entries) ++ return 0; ++ ++ /* ++ * Modules may share a directory, but those with shared_dir ++ * set must be loaded (via symbol dependencies) after parents ++ * and unloaded beforehand. 
++ */ ++ if (module->shared_directory) { ++ struct toi_module_ops *shared = ++ toi_find_module_given_dir(module->shared_directory); ++ if (!shared) { ++ printk(KERN_ERR "TuxOnIce: Module %s wants to share " ++ "%s's directory but %s isn't loaded.\n", ++ module->name, module->shared_directory, ++ module->shared_directory); ++ toi_unregister_module(module); ++ return -ENODEV; ++ } ++ kobj = shared->dir_kobj; ++ } else { ++ if (!strncmp(module->directory, "[ROOT]", 6)) ++ kobj = tuxonice_kobj; ++ else ++ kobj = make_toi_sysdir(module->directory); ++ } ++ module->dir_kobj = kobj; ++ for (i = 0; i < module->num_sysfs_entries; i++) { ++ int result = toi_register_sysfs_file(kobj, ++ &module->sysfs_data[i]); ++ if (result) ++ return result; ++ } ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_register_module); ++ ++/* ++ * toi_unregister_module ++ * ++ * Remove a module. ++ */ ++void toi_unregister_module(struct toi_module_ops *module) ++{ ++ int i; ++ ++ if (module->dir_kobj) ++ for (i = 0; i < module->num_sysfs_entries; i++) ++ toi_unregister_sysfs_file(module->dir_kobj, ++ &module->sysfs_data[i]); ++ ++ if (!module->shared_directory && module->directory && ++ strncmp(module->directory, "[ROOT]", 6)) ++ remove_toi_sysdir(module->dir_kobj); ++ ++ switch (module->type) { ++ case FILTER_MODULE: ++ list_del(&module->type_list); ++ toi_num_filters--; ++ break; ++ case WRITER_MODULE: ++ list_del(&module->type_list); ++ toiNumAllocators--; ++ if (toiActiveAllocator == module) { ++ toiActiveAllocator = NULL; ++ clear_toi_state(TOI_CAN_RESUME); ++ clear_toi_state(TOI_CAN_HIBERNATE); ++ } ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return; ++ } ++ list_del(&module->module_list); ++ toi_num_modules--; ++} ++EXPORT_SYMBOL_GPL(toi_unregister_module); ++ ++/* ++ * toi_move_module_tail ++ * ++ * Rearrange modules when reloading the config. ++ */ ++void toi_move_module_tail(struct toi_module_ops *module) ++{ ++ switch (module->type) { ++ case FILTER_MODULE: ++ if (toi_num_filters > 1) ++ list_move_tail(&module->type_list, &toi_filters); ++ break; ++ case WRITER_MODULE: ++ if (toiNumAllocators > 1) ++ list_move_tail(&module->type_list, &toiAllocators); ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return; ++ } ++ if ((toi_num_filters + toiNumAllocators) > 1) ++ list_move_tail(&module->module_list, &toi_modules); ++} ++ ++/* ++ * toi_initialise_modules ++ * ++ * Get ready to do some work! ++ */ ++int toi_initialise_modules(int starting_cycle, int early) ++{ ++ struct toi_module_ops *this_module; ++ int result; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ this_module->header_requested = 0; ++ this_module->header_used = 0; ++ if (!this_module->enabled) ++ continue; ++ if (this_module->early != early) ++ continue; ++ if (this_module->initialise) { ++ result = this_module->initialise(starting_cycle); ++ if (result) { ++ toi_cleanup_modules(starting_cycle); ++ return result; ++ } ++ this_module->initialised = 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_cleanup_modules ++ * ++ * Tell modules the work is done. 
++ */ ++void toi_cleanup_modules(int finishing_cycle) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || !this_module->initialised) ++ continue; ++ if (this_module->cleanup) ++ this_module->cleanup(finishing_cycle); ++ this_module->initialised = 0; ++ } ++} ++ ++/* ++ * toi_pre_atomic_restore_modules ++ * ++ * Get ready to do some work! ++ */ ++void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->enabled && this_module->pre_atomic_restore) ++ this_module->pre_atomic_restore(bkd); ++ } ++} ++ ++/* ++ * toi_post_atomic_restore_modules ++ * ++ * Get ready to do some work! ++ */ ++void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->enabled && this_module->post_atomic_restore) ++ this_module->post_atomic_restore(bkd); ++ } ++} ++ ++/* ++ * toi_get_next_filter ++ * ++ * Get the next filter in the pipeline. ++ */ ++struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) ++{ ++ struct toi_module_ops *last_filter = NULL, *this_filter = NULL; ++ ++ list_for_each_entry(this_filter, &toi_filters, type_list) { ++ if (!this_filter->enabled) ++ continue; ++ if ((last_filter == filter_sought) || (!filter_sought)) ++ return this_filter; ++ last_filter = this_filter; ++ } ++ ++ return toiActiveAllocator; ++} ++EXPORT_SYMBOL_GPL(toi_get_next_filter); ++ ++/** ++ * toi_show_modules: Printk what support is loaded. ++ */ ++void toi_print_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int prev = 0; ++ ++ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->type == MISC_HIDDEN_MODULE) ++ continue; ++ printk("%s %s%s%s", prev ? "," : "", ++ this_module->enabled ? "" : "[", ++ this_module->name, ++ this_module->enabled ? "" : "]"); ++ prev = 1; ++ } ++ ++ printk(".\n"); ++} ++ ++/* toi_get_modules ++ * ++ * Take a reference to modules so they can't go away under us. ++ */ ++ ++int toi_get_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ struct toi_module_ops *this_module2; ++ ++ if (try_module_get(this_module->module)) ++ continue; ++ ++ /* Failed! Reverse gets and return error */ ++ list_for_each_entry(this_module2, &toi_modules, ++ module_list) { ++ if (this_module == this_module2) ++ return -EINVAL; ++ module_put(this_module2->module); ++ } ++ } ++ return 0; ++} ++ ++/* toi_put_modules ++ * ++ * Release our references to modules we used. ++ */ ++ ++void toi_put_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) ++ module_put(this_module->module); ++} +diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h +new file mode 100644 +index 0000000..bf5d749 +--- /dev/null ++++ b/kernel/power/tuxonice_modules.h +@@ -0,0 +1,211 @@ ++/* ++ * kernel/power/tuxonice_modules.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations for modules. 
Modules are additions to ++ * TuxOnIce that provide facilities such as image compression or ++ * encryption, backends for storage of the image and user interfaces. ++ * ++ */ ++ ++#ifndef TOI_MODULES_H ++#define TOI_MODULES_H ++ ++/* This is the maximum size we store in the image header for a module name */ ++#define TOI_MAX_MODULE_NAME_LENGTH 30 ++ ++struct toi_boot_kernel_data; ++ ++/* Per-module metadata */ ++struct toi_module_header { ++ char name[TOI_MAX_MODULE_NAME_LENGTH]; ++ int enabled; ++ int type; ++ int index; ++ int data_length; ++ unsigned long signature; ++}; ++ ++enum { ++ FILTER_MODULE, ++ WRITER_MODULE, ++ BIO_ALLOCATOR_MODULE, ++ MISC_MODULE, ++ MISC_HIDDEN_MODULE, ++}; ++ ++enum { ++ TOI_ASYNC, ++ TOI_SYNC ++}; ++ ++enum { ++ TOI_VIRT, ++ TOI_PAGE, ++}; ++ ++#define TOI_MAP(type, addr) \ ++ (type == TOI_PAGE ? kmap(addr) : addr) ++ ++#define TOI_UNMAP(type, addr) \ ++ do { \ ++ if (type == TOI_PAGE) \ ++ kunmap(addr); \ ++ } while(0) ++ ++struct toi_module_ops { ++ /* Functions common to all modules */ ++ int type; ++ char *name; ++ char *directory; ++ char *shared_directory; ++ struct kobject *dir_kobj; ++ struct module *module; ++ int enabled, early, initialised; ++ struct list_head module_list; ++ ++ /* List of filters or allocators */ ++ struct list_head list, type_list; ++ ++ /* ++ * Requirements for memory and storage in ++ * the image header.. ++ */ ++ int (*memory_needed) (void); ++ int (*storage_needed) (void); ++ ++ int header_requested, header_used; ++ ++ int (*expected_compression) (void); ++ ++ /* ++ * Debug info ++ */ ++ int (*print_debug_info) (char *buffer, int size); ++ int (*save_config_info) (char *buffer); ++ void (*load_config_info) (char *buffer, int len); ++ ++ /* ++ * Initialise & cleanup - general routines called ++ * at the start and end of a cycle. ++ */ ++ int (*initialise) (int starting_cycle); ++ void (*cleanup) (int finishing_cycle); ++ ++ void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); ++ void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); ++ ++ /* ++ * Calls for allocating storage (allocators only). ++ * ++ * Header space is requested separately and cannot fail, but the ++ * reservation is only applied when main storage is allocated. ++ * The header space reservation is thus always set prior to ++ * requesting the allocation of storage - and prior to querying ++ * how much storage is available. ++ */ ++ ++ unsigned long (*storage_available) (void); ++ void (*reserve_header_space) (unsigned long space_requested); ++ int (*register_storage) (void); ++ int (*allocate_storage) (unsigned long space_requested); ++ unsigned long (*storage_allocated) (void); ++ ++ /* ++ * Routines used in image I/O. 
++ */ ++ int (*rw_init) (int rw, int stream_number); ++ int (*rw_cleanup) (int rw); ++ int (*write_page) (unsigned long index, int buf_type, void *buf, ++ unsigned int buf_size); ++ int (*read_page) (unsigned long *index, int buf_type, void *buf, ++ unsigned int *buf_size); ++ int (*io_flusher) (int rw); ++ ++ /* Reset module if image exists but reading aborted */ ++ void (*noresume_reset) (void); ++ ++ /* Read and write the metadata */ ++ int (*write_header_init) (void); ++ int (*write_header_cleanup) (void); ++ ++ int (*read_header_init) (void); ++ int (*read_header_cleanup) (void); ++ ++ /* To be called after read_header_init */ ++ int (*get_header_version) (void); ++ ++ int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, ++ char *buffer_start, int buffer_size); ++ ++ int (*rw_header_chunk_noreadahead) (int rw, ++ struct toi_module_ops *owner, char *buffer_start, ++ int buffer_size); ++ ++ /* Attempt to parse an image location */ ++ int (*parse_sig_location) (char *buffer, int only_writer, int quiet); ++ ++ /* Throttle I/O according to throughput */ ++ void (*update_throughput_throttle) (int jif_index); ++ ++ /* Flush outstanding I/O */ ++ int (*finish_all_io) (void); ++ ++ /* Determine whether image exists that we can restore */ ++ int (*image_exists) (int quiet); ++ ++ /* Mark the image as having tried to resume */ ++ int (*mark_resume_attempted) (int); ++ ++ /* Destroy image if one exists */ ++ int (*remove_image) (void); ++ ++ /* Sysfs Data */ ++ struct toi_sysfs_data *sysfs_data; ++ int num_sysfs_entries; ++ ++ /* Block I/O allocator */ ++ struct toi_bio_allocator_ops *bio_allocator_ops; ++}; ++ ++extern int toi_num_modules, toiNumAllocators; ++ ++extern struct toi_module_ops *toiActiveAllocator; ++extern struct list_head toi_filters, toiAllocators, toi_modules; ++ ++extern void toi_prepare_console_modules(void); ++extern void toi_cleanup_console_modules(void); ++ ++extern struct toi_module_ops *toi_find_module_given_name(char *name); ++extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); ++ ++extern int toi_register_module(struct toi_module_ops *module); ++extern void toi_move_module_tail(struct toi_module_ops *module); ++ ++extern long toi_header_storage_for_modules(void); ++extern long toi_memory_for_modules(int print_parts); ++extern void print_toi_header_storage_for_modules(void); ++extern int toi_expected_compression_ratio(void); ++ ++extern int toi_print_module_debug_info(char *buffer, int buffer_size); ++extern int toi_register_module(struct toi_module_ops *module); ++extern void toi_unregister_module(struct toi_module_ops *module); ++ ++extern int toi_initialise_modules(int starting_cycle, int early); ++#define toi_initialise_modules_early(starting) \ ++ toi_initialise_modules(starting, 1) ++#define toi_initialise_modules_late(starting) \ ++ toi_initialise_modules(starting, 0) ++extern void toi_cleanup_modules(int finishing_cycle); ++ ++extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); ++extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); ++ ++extern void toi_print_modules(void); ++ ++int toi_get_modules(void); ++void toi_put_modules(void); ++#endif +diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c +new file mode 100644 +index 0000000..75b4aa9 +--- /dev/null ++++ b/kernel/power/tuxonice_netlink.c +@@ -0,0 +1,329 @@ ++/* ++ * kernel/power/tuxonice_netlink.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is 
released under the GPLv2. ++ * ++ * Functions for communicating with a userspace helper via netlink. ++ */ ++ ++ ++#include ++#include ++#include "tuxonice_netlink.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++ ++static struct user_helper_data *uhd_list; ++ ++/* ++ * Refill our pool of SKBs for use in emergencies (eg, when eating memory and ++ * none can be allocated). ++ */ ++static void toi_fill_skb_pool(struct user_helper_data *uhd) ++{ ++ while (uhd->pool_level < uhd->pool_limit) { ++ struct sk_buff *new_skb = ++ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); ++ ++ if (!new_skb) ++ break; ++ ++ new_skb->next = uhd->emerg_skbs; ++ uhd->emerg_skbs = new_skb; ++ uhd->pool_level++; ++ } ++} ++ ++/* ++ * Try to allocate a single skb. If we can't get one, try to use one from ++ * our pool. ++ */ ++static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) ++{ ++ struct sk_buff *skb = ++ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); ++ ++ if (skb) ++ return skb; ++ ++ skb = uhd->emerg_skbs; ++ if (skb) { ++ uhd->pool_level--; ++ uhd->emerg_skbs = skb->next; ++ skb->next = NULL; ++ } ++ ++ return skb; ++} ++ ++void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len) ++{ ++ struct sk_buff *skb; ++ struct nlmsghdr *nlh; ++ void *dest; ++ struct task_struct *t; ++ ++ if (uhd->pid == -1) ++ return; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "toi_send_netlink_message: Send " ++ "message type %d.\n", type); ++ ++ skb = toi_get_skb(uhd); ++ if (!skb) { ++ printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); ++ return; ++ } ++ ++ nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); ++ uhd->sock_seq++; ++ ++ dest = NLMSG_DATA(nlh); ++ if (params && len > 0) ++ memcpy(dest, params, len); ++ ++ netlink_unicast(uhd->nl, skb, uhd->pid, 0); ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); ++ if (!t) { ++ toi_read_unlock_tasklist(); ++ if (uhd->pid > -1) ++ printk(KERN_INFO "Hmm. Can't find the userspace task" ++ " %d.\n", uhd->pid); ++ return; ++ } ++ wake_up_process(t); ++ toi_read_unlock_tasklist(); ++ ++ yield(); ++} ++EXPORT_SYMBOL_GPL(toi_send_netlink_message); ++ ++static void send_whether_debugging(struct user_helper_data *uhd) ++{ ++ static u8 is_debugging = 1; ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, ++ &is_debugging, sizeof(u8)); ++} ++ ++/* ++ * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we ++ * are hibernating. ++ */ ++static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) ++{ ++ struct task_struct *t; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(pid, &init_pid_ns); ++ if (!t) { ++ toi_read_unlock_tasklist(); ++ printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", ++ pid); ++ return -EINVAL; ++ } ++ ++ t->flags |= PF_NOFREEZE; ++ ++ toi_read_unlock_tasklist(); ++ uhd->pid = pid; ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); ++ ++ return 0; ++} ++ ++/* ++ * Called when the userspace process has informed us that it's ready to roll. ++ */ ++static int nl_ready(struct user_helper_data *uhd, u32 version) ++{ ++ if (version != uhd->interface_version) { ++ printk(KERN_INFO "%s userspace process using invalid interface" ++ " version (%d - kernel wants %d). 
Trying to " ++ "continue without it.\n", ++ uhd->name, version, uhd->interface_version); ++ if (uhd->not_ready) ++ uhd->not_ready(); ++ return -EINVAL; ++ } ++ ++ complete(&uhd->wait_for_process); ++ ++ return 0; ++} ++ ++void toi_netlink_close_complete(struct user_helper_data *uhd) ++{ ++ if (uhd->nl) { ++ netlink_kernel_release(uhd->nl); ++ uhd->nl = NULL; ++ } ++ ++ while (uhd->emerg_skbs) { ++ struct sk_buff *next = uhd->emerg_skbs->next; ++ kfree_skb(uhd->emerg_skbs); ++ uhd->emerg_skbs = next; ++ } ++ ++ uhd->pid = -1; ++} ++EXPORT_SYMBOL_GPL(toi_netlink_close_complete); ++ ++static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, ++ struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type = nlh->nlmsg_type; ++ int *data; ++ int err; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", ++ type); ++ ++ /* Let the more specific handler go first. It returns ++ * 1 for valid messages that it doesn't know. */ ++ err = uhd->rcv_msg(skb, nlh); ++ if (err != 1) ++ return err; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { ++ printk(KERN_INFO "Received extra nofreeze me requests.\n"); ++ return -EBUSY; ++ } ++ ++ data = NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case NETLINK_MSG_NOFREEZE_ME: ++ return nl_set_nofreeze(uhd, nlh->nlmsg_pid); ++ case NETLINK_MSG_GET_DEBUGGING: ++ send_whether_debugging(uhd); ++ return 0; ++ case NETLINK_MSG_READY: ++ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { ++ printk(KERN_INFO "Invalid ready mesage.\n"); ++ if (uhd->not_ready) ++ uhd->not_ready(); ++ return -EINVAL; ++ } ++ return nl_ready(uhd, (u32) *data); ++ case NETLINK_MSG_CLEANUP: ++ toi_netlink_close_complete(uhd); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++static void toi_user_rcv_skb(struct sk_buff *skb) ++{ ++ int err; ++ struct nlmsghdr *nlh; ++ struct user_helper_data *uhd = uhd_list; ++ ++ while (uhd && uhd->netlink_id != skb->sk->sk_protocol) ++ uhd = uhd->next; ++ ++ if (!uhd) ++ return; ++ ++ while (skb->len >= NLMSG_SPACE(0)) { ++ u32 rlen; ++ ++ nlh = (struct nlmsghdr *) skb->data; ++ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) ++ return; ++ ++ rlen = NLMSG_ALIGN(nlh->nlmsg_len); ++ if (rlen > skb->len) ++ rlen = skb->len; ++ ++ err = toi_nl_gen_rcv_msg(uhd, skb, nlh); ++ if (err) ++ netlink_ack(skb, nlh, err); ++ else if (nlh->nlmsg_flags & NLM_F_ACK) ++ netlink_ack(skb, nlh, 0); ++ skb_pull(skb, rlen); ++ } ++} ++ ++static int netlink_prepare(struct user_helper_data *uhd) ++{ ++ struct netlink_kernel_cfg cfg = { ++ .groups = 0, ++ .input = toi_user_rcv_skb, ++ }; ++ ++ uhd->next = uhd_list; ++ uhd_list = uhd; ++ ++ uhd->sock_seq = 0x42c0ffee; ++ uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); ++ if (!uhd->nl) { ++ printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", ++ uhd->name); ++ return -ENOMEM; ++ } ++ ++ toi_fill_skb_pool(uhd); ++ ++ return 0; ++} ++ ++void toi_netlink_close(struct user_helper_data *uhd) ++{ ++ struct task_struct *t; ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); ++ if (t) ++ t->flags &= ~PF_NOFREEZE; ++ toi_read_unlock_tasklist(); ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); ++} ++EXPORT_SYMBOL_GPL(toi_netlink_close); ++ ++int toi_netlink_setup(struct user_helper_data *uhd) ++{ ++ /* In case userui didn't cleanup properly on us */ ++ toi_netlink_close_complete(uhd); ++ ++ if (netlink_prepare(uhd) < 0) { ++ printk(KERN_INFO "Netlink 
prepare failed.\n"); ++ return 1; ++ } ++ ++ if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, ++ UMH_WAIT_EXEC, uhd->debug) < 0) { ++ printk(KERN_INFO "Launch userspace program failed.\n"); ++ toi_netlink_close_complete(uhd); ++ return 1; ++ } ++ ++ /* Wait 2 seconds for the userspace process to make contact */ ++ wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); ++ ++ if (uhd->pid == -1) { ++ printk(KERN_INFO "%s: Failed to contact userspace process.\n", ++ uhd->name); ++ toi_netlink_close_complete(uhd); ++ return 1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_netlink_setup); +diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h +new file mode 100644 +index 0000000..b8ef06e +--- /dev/null ++++ b/kernel/power/tuxonice_netlink.h +@@ -0,0 +1,62 @@ ++/* ++ * kernel/power/tuxonice_netlink.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Declarations for functions for communicating with a userspace helper ++ * via netlink. ++ */ ++ ++#include ++#include ++ ++#define NETLINK_MSG_BASE 0x10 ++ ++#define NETLINK_MSG_READY 0x10 ++#define NETLINK_MSG_NOFREEZE_ME 0x16 ++#define NETLINK_MSG_GET_DEBUGGING 0x19 ++#define NETLINK_MSG_CLEANUP 0x24 ++#define NETLINK_MSG_NOFREEZE_ACK 0x27 ++#define NETLINK_MSG_IS_DEBUGGING 0x28 ++ ++struct user_helper_data { ++ int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); ++ void (*not_ready) (void); ++ struct sock *nl; ++ u32 sock_seq; ++ pid_t pid; ++ char *comm; ++ char program[256]; ++ int pool_level; ++ int pool_limit; ++ struct sk_buff *emerg_skbs; ++ int skb_size; ++ int netlink_id; ++ char *name; ++ struct user_helper_data *next; ++ struct completion wait_for_process; ++ u32 interface_version; ++ int must_init; ++ int debug; ++}; ++ ++#ifdef CONFIG_NET ++int toi_netlink_setup(struct user_helper_data *uhd); ++void toi_netlink_close(struct user_helper_data *uhd); ++void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len); ++void toi_netlink_close_complete(struct user_helper_data *uhd); ++#else ++static inline int toi_netlink_setup(struct user_helper_data *uhd) ++{ ++ return 0; ++} ++ ++static inline void toi_netlink_close(struct user_helper_data *uhd) { }; ++static inline void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len) { }; ++static inline void toi_netlink_close_complete(struct user_helper_data *uhd) ++ { }; ++#endif +diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c +new file mode 100644 +index 0000000..ce0d38c +--- /dev/null ++++ b/kernel/power/tuxonice_pagedir.c +@@ -0,0 +1,346 @@ ++/* ++ * kernel/power/tuxonice_pagedir.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for handling pagesets. ++ * Note that pbes aren't actually stored as such. They're stored as ++ * bitmaps and extents. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_pageflags.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_alloc.h" ++ ++static int ptoi_pfn; ++static struct pbe *this_low_pbe; ++static struct pbe **last_low_pbe_ptr; ++ ++void toi_reset_alt_image_pageset2_pfn(void) ++{ ++ memory_bm_position_reset(pageset2_map); ++} ++ ++static struct page *first_conflicting_page; ++ ++/* ++ * free_conflicting_pages ++ */ ++ ++static void free_conflicting_pages(void) ++{ ++ while (first_conflicting_page) { ++ struct page *next = ++ *((struct page **) kmap(first_conflicting_page)); ++ kunmap(first_conflicting_page); ++ toi__free_page(29, first_conflicting_page); ++ first_conflicting_page = next; ++ } ++} ++ ++/* __toi_get_nonconflicting_page ++ * ++ * Description: Gets order zero pages that won't be overwritten ++ * while copying the original pages. ++ */ ++ ++struct page *___toi_get_nonconflicting_page(int can_be_highmem) ++{ ++ struct page *page; ++ gfp_t flags = TOI_ATOMIC_GFP; ++ if (can_be_highmem) ++ flags |= __GFP_HIGHMEM; ++ ++ ++ if (test_toi_state(TOI_LOADING_ALT_IMAGE) && ++ pageset2_map && ++ (ptoi_pfn != BM_END_OF_MAP)) { ++ do { ++ ptoi_pfn = memory_bm_next_pfn(pageset2_map); ++ if (ptoi_pfn != BM_END_OF_MAP) { ++ page = pfn_to_page(ptoi_pfn); ++ if (!PagePageset1(page) && ++ (can_be_highmem || !PageHighMem(page))) ++ return page; ++ } ++ } while (ptoi_pfn != BM_END_OF_MAP); ++ } ++ ++ do { ++ page = toi_alloc_page(29, flags); ++ if (!page) { ++ printk(KERN_INFO "Failed to get nonconflicting " ++ "page.\n"); ++ return NULL; ++ } ++ if (PagePageset1(page)) { ++ struct page **next = (struct page **) kmap(page); ++ *next = first_conflicting_page; ++ first_conflicting_page = page; ++ kunmap(page); ++ } ++ } while (PagePageset1(page)); ++ ++ return page; ++} ++ ++unsigned long __toi_get_nonconflicting_page(void) ++{ ++ struct page *page = ___toi_get_nonconflicting_page(0); ++ return page ? (unsigned long) page_address(page) : 0; ++} ++ ++static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, ++ int highmem) ++{ ++ if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) ++ + 2 * sizeof(struct pbe)) > PAGE_SIZE) { ++ struct page *new_page = ++ ___toi_get_nonconflicting_page(highmem); ++ if (!new_page) ++ return ERR_PTR(-ENOMEM); ++ this_pbe = (struct pbe *) kmap(new_page); ++ memset(this_pbe, 0, PAGE_SIZE); ++ *page_ptr = new_page; ++ } else ++ this_pbe++; ++ ++ return this_pbe; ++} ++ ++/** ++ * get_pageset1_load_addresses - generate pbes for conflicting pages ++ * ++ * We check here that pagedir & pages it points to won't collide ++ * with pages where we're going to restore from the loaded pages ++ * later. ++ * ++ * Returns: ++ * Zero on success, one if couldn't find enough pages (shouldn't ++ * happen). 
++ **/ ++int toi_get_pageset1_load_addresses(void) ++{ ++ int pfn, highallocd = 0, lowallocd = 0; ++ int low_needed = pagedir1.size - get_highmem_size(pagedir1); ++ int high_needed = get_highmem_size(pagedir1); ++ int low_pages_for_highmem = 0; ++ gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; ++ struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, ++ *low_pbe_page, *last_low_pbe_page = NULL; ++ struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, ++ *this_high_pbe = NULL; ++ unsigned long orig_low_pfn, orig_high_pfn; ++ int high_pbes_done = 0, low_pbes_done = 0; ++ int low_direct = 0, high_direct = 0, result = 0, i; ++ int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; ++ ++ memory_bm_set_iterators(pageset1_map, 3); ++ memory_bm_position_reset(pageset1_map); ++ ++ memory_bm_set_iterators(pageset1_copy_map, 2); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ last_low_pbe_ptr = &restore_pblist; ++ ++ /* First, allocate pages for the start of our pbe lists. */ ++ if (high_needed) { ++ high_pbe_page = ___toi_get_nonconflicting_page(1); ++ if (!high_pbe_page) { ++ result = -ENOMEM; ++ goto out; ++ } ++ this_high_pbe = (struct pbe *) kmap(high_pbe_page); ++ memset(this_high_pbe, 0, PAGE_SIZE); ++ } ++ ++ low_pbe_page = ___toi_get_nonconflicting_page(0); ++ if (!low_pbe_page) { ++ result = -ENOMEM; ++ goto out; ++ } ++ this_low_pbe = (struct pbe *) page_address(low_pbe_page); ++ ++ /* ++ * Next, allocate the number of pages we need. ++ */ ++ ++ i = low_needed + high_needed; ++ ++ do { ++ int is_high; ++ ++ if (i == low_needed) ++ flags &= ~__GFP_HIGHMEM; ++ ++ page = toi_alloc_page(30, flags); ++ BUG_ON(!page); ++ ++ SetPagePageset1Copy(page); ++ is_high = PageHighMem(page); ++ ++ if (PagePageset1(page)) { ++ if (is_high) ++ high_direct++; ++ else ++ low_direct++; ++ } else { ++ if (is_high) ++ highallocd++; ++ else ++ lowallocd++; ++ } ++ } while (--i); ++ ++ high_needed -= high_direct; ++ low_needed -= low_direct; ++ ++ /* ++ * Do we need to use some lowmem pages for the copies of highmem ++ * pages? ++ */ ++ if (high_needed > highallocd) { ++ low_pages_for_highmem = high_needed - highallocd; ++ high_needed -= low_pages_for_highmem; ++ low_needed += low_pages_for_highmem; ++ } ++ ++ /* ++ * Now generate our pbes (which will be used for the atomic restore), ++ * and free unneeded pages. ++ */ ++ memory_bm_position_reset(pageset1_copy_map); ++ for (pfn = memory_bm_next_pfn_index(pageset1_copy_map, 1); pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn_index(pageset1_copy_map, 1)) { ++ int is_high; ++ page = pfn_to_page(pfn); ++ is_high = PageHighMem(page); ++ ++ if (PagePageset1(page)) ++ continue; ++ ++ /* Nope. We're going to use this page. Add a pbe. 
*/ ++ if (is_high || low_pages_for_highmem) { ++ struct page *orig_page; ++ high_pbes_done++; ++ if (!is_high) ++ low_pages_for_highmem--; ++ do { ++ orig_high_pfn = memory_bm_next_pfn_index(pageset1_map, 1); ++ BUG_ON(orig_high_pfn == BM_END_OF_MAP); ++ orig_page = pfn_to_page(orig_high_pfn); ++ } while (!PageHighMem(orig_page) || ++ PagePageset1Copy(orig_page)); ++ ++ this_high_pbe->orig_address = (void *) orig_high_pfn; ++ this_high_pbe->address = page; ++ this_high_pbe->next = NULL; ++ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", ++ high_page, high_offset, page, orig_high_pfn, orig_page); ++ if (last_high_pbe_page != high_pbe_page) { ++ *last_high_pbe_ptr = ++ (struct pbe *) high_pbe_page; ++ if (last_high_pbe_page) { ++ kunmap(last_high_pbe_page); ++ high_page++; ++ high_offset = 0; ++ } else ++ high_offset++; ++ last_high_pbe_page = high_pbe_page; ++ } else { ++ *last_high_pbe_ptr = this_high_pbe; ++ high_offset++; ++ } ++ last_high_pbe_ptr = &this_high_pbe->next; ++ this_high_pbe = get_next_pbe(&high_pbe_page, ++ this_high_pbe, 1); ++ if (IS_ERR(this_high_pbe)) { ++ printk(KERN_INFO ++ "This high pbe is an error.\n"); ++ return -ENOMEM; ++ } ++ } else { ++ struct page *orig_page; ++ low_pbes_done++; ++ do { ++ orig_low_pfn = memory_bm_next_pfn_index(pageset1_map, 2); ++ BUG_ON(orig_low_pfn == BM_END_OF_MAP); ++ orig_page = pfn_to_page(orig_low_pfn); ++ } while (PageHighMem(orig_page) || ++ PagePageset1Copy(orig_page)); ++ ++ this_low_pbe->orig_address = page_address(orig_page); ++ this_low_pbe->address = page_address(page); ++ this_low_pbe->next = NULL; ++ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", ++ low_page, low_offset, this_low_pbe->orig_address, ++ orig_low_pfn, this_low_pbe->address); ++ *last_low_pbe_ptr = this_low_pbe; ++ last_low_pbe_ptr = &this_low_pbe->next; ++ this_low_pbe = get_next_pbe(&low_pbe_page, ++ this_low_pbe, 0); ++ if (low_pbe_page != last_low_pbe_page) { ++ if (last_low_pbe_page) { ++ low_page++; ++ low_offset = 0; ++ } ++ last_low_pbe_page = low_pbe_page; ++ } else ++ low_offset++; ++ if (IS_ERR(this_low_pbe)) { ++ printk(KERN_INFO "this_low_pbe is an error.\n"); ++ return -ENOMEM; ++ } ++ } ++ } ++ ++ if (high_pbe_page) ++ kunmap(high_pbe_page); ++ ++ if (last_high_pbe_page != high_pbe_page) { ++ if (last_high_pbe_page) ++ kunmap(last_high_pbe_page); ++ toi__free_page(29, high_pbe_page); ++ } ++ ++ free_conflicting_pages(); ++ ++out: ++ memory_bm_set_iterators(pageset1_map, 1); ++ memory_bm_set_iterators(pageset1_copy_map, 1); ++ return result; ++} ++ ++int add_boot_kernel_data_pbe(void) ++{ ++ this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); ++ if (!this_low_pbe->address) { ++ printk(KERN_INFO "Failed to get bkd atomic restore buffer."); ++ return -ENOMEM; ++ } ++ ++ toi_bkd.size = sizeof(toi_bkd); ++ memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); ++ ++ *last_low_pbe_ptr = this_low_pbe; ++ this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; ++ this_low_pbe->next = NULL; ++ return 0; ++} +diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h +new file mode 100644 +index 0000000..d08e4b1 +--- /dev/null ++++ b/kernel/power/tuxonice_pagedir.h +@@ -0,0 +1,50 @@ ++/* ++ * kernel/power/tuxonice_pagedir.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Declarations for routines for handling pagesets. 
++ */ ++ ++#ifndef KERNEL_POWER_PAGEDIR_H ++#define KERNEL_POWER_PAGEDIR_H ++ ++/* Pagedir ++ * ++ * Contains the metadata for a set of pages saved in the image. ++ */ ++ ++struct pagedir { ++ int id; ++ unsigned long size; ++#ifdef CONFIG_HIGHMEM ++ unsigned long size_high; ++#endif ++}; ++ ++#ifdef CONFIG_HIGHMEM ++#define get_highmem_size(pagedir) (pagedir.size_high) ++#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) ++#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) ++#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) ++#else ++#define get_highmem_size(pagedir) (0) ++#define set_highmem_size(pagedir, sz) do { } while (0) ++#define inc_highmem_size(pagedir) do { } while (0) ++#define get_lowmem_size(pagedir) (pagedir.size) ++#endif ++ ++extern struct pagedir pagedir1, pagedir2; ++ ++extern void toi_copy_pageset1(void); ++ ++extern int toi_get_pageset1_load_addresses(void); ++ ++extern unsigned long __toi_get_nonconflicting_page(void); ++struct page *___toi_get_nonconflicting_page(int can_be_highmem); ++ ++extern void toi_reset_alt_image_pageset2_pfn(void); ++extern int add_boot_kernel_data_pbe(void); ++#endif +diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c +new file mode 100644 +index 0000000..77fab4f +--- /dev/null ++++ b/kernel/power/tuxonice_pageflags.c +@@ -0,0 +1,29 @@ ++/* ++ * kernel/power/tuxonice_pageflags.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for serialising and relocating pageflags in which we ++ * store our image metadata. ++ */ ++ ++#include ++#include ++#include "tuxonice_pageflags.h" ++#include "power.h" ++ ++int toi_pageflags_space_needed(void) ++{ ++ int total = 0; ++ struct bm_block *bb; ++ ++ total = sizeof(unsigned int); ++ ++ list_for_each_entry(bb, &pageset1_map->blocks, hook) ++ total += 2 * sizeof(unsigned long) + PAGE_SIZE; ++ ++ return total; ++} ++EXPORT_SYMBOL_GPL(toi_pageflags_space_needed); +diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h +new file mode 100644 +index 0000000..d5aa7b1 +--- /dev/null ++++ b/kernel/power/tuxonice_pageflags.h +@@ -0,0 +1,72 @@ ++/* ++ * kernel/power/tuxonice_pageflags.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H ++#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H ++ ++extern struct memory_bitmap *pageset1_map; ++extern struct memory_bitmap *pageset1_copy_map; ++extern struct memory_bitmap *pageset2_map; ++extern struct memory_bitmap *page_resave_map; ++extern struct memory_bitmap *io_map; ++extern struct memory_bitmap *nosave_map; ++extern struct memory_bitmap *free_map; ++ ++#define PagePageset1(page) \ ++ (memory_bm_test_bit(pageset1_map, page_to_pfn(page))) ++#define SetPagePageset1(page) \ ++ (memory_bm_set_bit(pageset1_map, page_to_pfn(page))) ++#define ClearPagePageset1(page) \ ++ (memory_bm_clear_bit(pageset1_map, page_to_pfn(page))) ++ ++#define PagePageset1Copy(page) \ ++ (memory_bm_test_bit(pageset1_copy_map, page_to_pfn(page))) ++#define SetPagePageset1Copy(page) \ ++ (memory_bm_set_bit(pageset1_copy_map, page_to_pfn(page))) ++#define ClearPagePageset1Copy(page) \ ++ (memory_bm_clear_bit(pageset1_copy_map, page_to_pfn(page))) ++ ++#define PagePageset2(page) \ ++ (memory_bm_test_bit(pageset2_map, page_to_pfn(page))) ++#define SetPagePageset2(page) \ ++ (memory_bm_set_bit(pageset2_map, page_to_pfn(page))) ++#define ClearPagePageset2(page) \ ++ (memory_bm_clear_bit(pageset2_map, page_to_pfn(page))) ++ ++#define PageWasRW(page) \ ++ (memory_bm_test_bit(pageset2_map, page_to_pfn(page))) ++#define SetPageWasRW(page) \ ++ (memory_bm_set_bit(pageset2_map, page_to_pfn(page))) ++#define ClearPageWasRW(page) \ ++ (memory_bm_clear_bit(pageset2_map, page_to_pfn(page))) ++ ++#define PageResave(page) (page_resave_map ? \ ++ memory_bm_test_bit(page_resave_map, page_to_pfn(page)) : 0) ++#define SetPageResave(page) \ ++ (memory_bm_set_bit(page_resave_map, page_to_pfn(page))) ++#define ClearPageResave(page) \ ++ (memory_bm_clear_bit(page_resave_map, page_to_pfn(page))) ++ ++#define PageNosave(page) (nosave_map ? \ ++ memory_bm_test_bit(nosave_map, page_to_pfn(page)) : 0) ++#define SetPageNosave(page) \ ++ (memory_bm_set_bit(nosave_map, page_to_pfn(page))) ++#define ClearPageNosave(page) \ ++ (memory_bm_clear_bit(nosave_map, page_to_pfn(page))) ++ ++#define PageNosaveFree(page) (free_map ? \ ++ memory_bm_test_bit(free_map, page_to_pfn(page)) : 0) ++#define SetPageNosaveFree(page) \ ++ (memory_bm_set_bit(free_map, page_to_pfn(page))) ++#define ClearPageNosaveFree(page) \ ++ (memory_bm_clear_bit(free_map, page_to_pfn(page))) ++ ++extern void save_pageflags(struct memory_bitmap *pagemap); ++extern int load_pageflags(struct memory_bitmap *pagemap); ++extern int toi_pageflags_space_needed(void); ++#endif +diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c +new file mode 100644 +index 0000000..1604a95 +--- /dev/null ++++ b/kernel/power/tuxonice_power_off.c +@@ -0,0 +1,287 @@ ++/* ++ * kernel/power/tuxonice_power_off.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Support for powering down. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++ ++unsigned long toi_poweroff_method; /* 0 - Kernel power off */ ++EXPORT_SYMBOL_GPL(toi_poweroff_method); ++ ++static int wake_delay; ++static char lid_state_file[256], wake_alarm_dir[256]; ++static struct file *lid_file, *alarm_file, *epoch_file; ++static int post_wake_state = -1; ++ ++static int did_suspend_to_both; ++ ++/* ++ * __toi_power_down ++ * Functionality : Powers down or reboots the computer once the image ++ * has been written to disk. ++ * Key Assumptions : Able to reboot/power down via code called or that ++ * the warning emitted if the calls fail will be visible ++ * to the user (ie printk resumes devices). ++ */ ++ ++static void __toi_power_down(int method) ++{ ++ int error; ++ ++ toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : ++ "Powering down."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto out; ++ ++ if (test_action_state(TOI_REBOOT)) ++ kernel_restart(NULL); ++ ++ switch (method) { ++ case 0: ++ break; ++ case 3: ++ /* ++ * Re-read the overwritten part of pageset2 to make post-resume ++ * faster. ++ */ ++ if (read_pageset2(1)) ++ panic("Attempt to reload pagedir 2 failed. " ++ "Try rebooting."); ++ ++ pm_prepare_console(); ++ ++ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); ++ if (!error) { ++ pm_restore_gfp_mask(); ++ error = suspend_devices_and_enter(PM_SUSPEND_MEM); ++ pm_restrict_gfp_mask(); ++ if (!error) ++ did_suspend_to_both = 1; ++ } ++ pm_notifier_call_chain(PM_POST_SUSPEND); ++ pm_restore_console(); ++ ++ /* Success - we're now post-resume-from-ram */ ++ if (did_suspend_to_both) ++ return; ++ ++ /* Failed to suspend to ram - do normal power off */ ++ break; ++ case 4: ++ /* ++ * If succeeds, doesn't return. If fails, do a simple ++ * powerdown. ++ */ ++ hibernation_platform_enter(); ++ break; ++ case 5: ++ /* Historic entry only now */ ++ break; ++ } ++ ++ if (method && method != 5) ++ toi_cond_pause(1, ++ "Falling back to alternate power off method."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto out; ++ ++ kernel_power_off(); ++ kernel_halt(); ++ toi_cond_pause(1, "Powerdown failed."); ++ while (1) ++ cpu_relax(); ++ ++out: ++ if (read_pageset2(1)) ++ panic("Attempt to reload pagedir 2 failed. 
Try rebooting."); ++ return; ++} ++ ++#define CLOSE_FILE(file) \ ++ if (file) { \ ++ filp_close(file, NULL); file = NULL; \ ++ } ++ ++static void powerdown_cleanup(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return; ++ ++ CLOSE_FILE(lid_file); ++ CLOSE_FILE(alarm_file); ++ CLOSE_FILE(epoch_file); ++} ++ ++static void open_file(char *format, char *arg, struct file **var, int mode, ++ char *desc) ++{ ++ char buf[256]; ++ ++ if (strlen(arg)) { ++ sprintf(buf, format, arg); ++ *var = filp_open(buf, mode, 0); ++ if (IS_ERR(*var) || !*var) { ++ printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", ++ desc, buf, *var); ++ *var = NULL; ++ } ++ } ++} ++ ++static int powerdown_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ did_suspend_to_both = 0; ++ ++ open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, ++ O_RDONLY, "lid"); ++ ++ if (strlen(wake_alarm_dir)) { ++ open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, ++ &alarm_file, O_WRONLY, "alarm"); ++ ++ open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, ++ &epoch_file, O_RDONLY, "epoch"); ++ } ++ ++ return 0; ++} ++ ++static int lid_closed(void) ++{ ++ char array[25]; ++ ssize_t size; ++ loff_t pos = 0; ++ ++ if (!lid_file) ++ return 0; ++ ++ size = vfs_read(lid_file, (char __user *) array, 25, &pos); ++ if ((int) size < 1) { ++ printk(KERN_INFO "Failed to read lid state file (%d).\n", ++ (int) size); ++ return 0; ++ } ++ ++ if (!strcmp(array, "state: closed\n")) ++ return 1; ++ ++ return 0; ++} ++ ++static void write_alarm_file(int value) ++{ ++ ssize_t size; ++ char buf[40]; ++ loff_t pos = 0; ++ ++ if (!alarm_file) ++ return; ++ ++ sprintf(buf, "%d\n", value); ++ ++ size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); ++ ++ if (size < 0) ++ printk(KERN_INFO "Error %d writing alarm value %s.\n", ++ (int) size, buf); ++} ++ ++/** ++ * toi_check_resleep: See whether to powerdown again after waking. ++ * ++ * After waking, check whether we should powerdown again in a (usually ++ * different) way. We only do this if the lid switch is still closed. ++ */ ++void toi_check_resleep(void) ++{ ++ /* We only return if we suspended to ram and woke. */ ++ if (lid_closed() && post_wake_state >= 0) ++ __toi_power_down(post_wake_state); ++} ++ ++void toi_power_down(void) ++{ ++ if (alarm_file && wake_delay) { ++ char array[25]; ++ loff_t pos = 0; ++ size_t size = vfs_read(epoch_file, (char __user *) array, 25, ++ &pos); ++ ++ if (((int) size) < 1) ++ printk(KERN_INFO "Failed to read epoch file (%d).\n", ++ (int) size); ++ else { ++ unsigned long since_epoch; ++ if (!strict_strtoul(array, 0, &since_epoch)) { ++ /* Clear any wakeup time. */ ++ write_alarm_file(0); ++ ++ /* Set new wakeup time. 
*/ ++ write_alarm_file(since_epoch + wake_delay); ++ } ++ } ++ } ++ ++ __toi_power_down(toi_poweroff_method); ++ ++ toi_check_resleep(); ++} ++EXPORT_SYMBOL_GPL(toi_power_down); ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_ACPI) ++ SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), ++ SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), ++ SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), ++ SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, ++ NULL), ++ SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), ++ SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, ++ 0, 0, 0, NULL) ++#endif ++}; ++ ++static struct toi_module_ops powerdown_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "poweroff", ++ .initialise = powerdown_init, ++ .cleanup = powerdown_cleanup, ++ .directory = "[ROOT]", ++ .module = THIS_MODULE, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_poweroff_init(void) ++{ ++ return toi_register_module(&powerdown_ops); ++} ++ ++void toi_poweroff_exit(void) ++{ ++ toi_unregister_module(&powerdown_ops); ++} +diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h +new file mode 100644 +index 0000000..9aa0ea8 +--- /dev/null ++++ b/kernel/power/tuxonice_power_off.h +@@ -0,0 +1,24 @@ ++/* ++ * kernel/power/tuxonice_power_off.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Support for powering down. ++ */ ++ ++int toi_pm_state_finish(void); ++void toi_power_down(void); ++extern unsigned long toi_poweroff_method; ++int toi_poweroff_init(void); ++void toi_poweroff_exit(void); ++void toi_check_resleep(void); ++ ++extern int platform_begin(int platform_mode); ++extern int platform_pre_snapshot(int platform_mode); ++extern void platform_leave(int platform_mode); ++extern void platform_end(int platform_mode); ++extern void platform_finish(int platform_mode); ++extern int platform_pre_restore(int platform_mode); ++extern void platform_restore_cleanup(int platform_mode); +diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c +new file mode 100644 +index 0000000..a2d4259 +--- /dev/null ++++ b/kernel/power/tuxonice_prepare_image.c +@@ -0,0 +1,1115 @@ ++/* ++ * kernel/power/tuxonice_prepare_image.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * We need to eat memory until we can: ++ * 1. Perform the save without changing anything (RAM_NEEDED < #pages) ++ * 2. Fit it all in available space (toiActiveAllocator->available_space() >= ++ * main_storage_needed()) ++ * 3. Reload the pagedir and pageset1 to places that don't collide with their ++ * final destinations, not knowing to what extent the resumed kernel will ++ * overlap with the one loaded at boot time. I think the resumed kernel ++ * should overlap completely, but I don't want to rely on this as it is ++ * an unproven assumption. We therefore assume there will be no overlap at ++ * all (worst case). ++ * 4. Meet the user's requested limit (if any) on the size of the image. ++ * The limit is in MB, so pages/256 (assuming 4K pages).
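++ *    For instance (illustrative numbers only): an image_size_limit of
++ *    500 (MB) becomes 500 << 8 = 128000 4K pages, which is how
++ *    any_to_free() below derives its soft limit.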
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_pageflags.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_builtin.h" ++ ++static unsigned long num_nosave, main_storage_allocated, storage_limit, ++ header_storage_needed; ++unsigned long extra_pd1_pages_allowance = ++ CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; ++long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; ++static int no_ps2_needed; ++ ++struct attention_list { ++ struct task_struct *task; ++ struct attention_list *next; ++}; ++ ++static struct attention_list *attention_list; ++ ++#define PAGESET1 0 ++#define PAGESET2 1 ++ ++void free_attention_list(void) ++{ ++ struct attention_list *last = NULL; ++ ++ while (attention_list) { ++ last = attention_list; ++ attention_list = attention_list->next; ++ toi_kfree(6, last, sizeof(*last)); ++ } ++} ++ ++static int build_attention_list(void) ++{ ++ int i, task_count = 0; ++ struct task_struct *p; ++ struct attention_list *next; ++ ++ /* ++ * Count all userspace process (with task->mm) marked PF_NOFREEZE. ++ */ ++ toi_read_lock_tasklist(); ++ for_each_process(p) ++ if ((p->flags & PF_NOFREEZE) || p == current) ++ task_count++; ++ toi_read_unlock_tasklist(); ++ ++ /* ++ * Allocate attention list structs. ++ */ ++ for (i = 0; i < task_count; i++) { ++ struct attention_list *this = ++ toi_kzalloc(6, sizeof(struct attention_list), ++ TOI_WAIT_GFP); ++ if (!this) { ++ printk(KERN_INFO "Failed to allocate slab for " ++ "attention list.\n"); ++ free_attention_list(); ++ return 1; ++ } ++ this->next = NULL; ++ if (attention_list) ++ this->next = attention_list; ++ attention_list = this; ++ } ++ ++ next = attention_list; ++ toi_read_lock_tasklist(); ++ for_each_process(p) ++ if ((p->flags & PF_NOFREEZE) || p == current) { ++ next->task = p; ++ next = next->next; ++ } ++ toi_read_unlock_tasklist(); ++ return 0; ++} ++ ++static void pageset2_full(void) ++{ ++ struct zone *zone; ++ struct page *page; ++ unsigned long flags; ++ int i; ++ ++ for_each_populated_zone(zone) { ++ spin_lock_irqsave(&zone->lru_lock, flags); ++ for_each_lru(i) { ++ if (!zone_page_state(zone, NR_LRU_BASE + i)) ++ continue; ++ ++ list_for_each_entry(page, &zone->lruvec.lists[i], lru) { ++ struct address_space *mapping; ++ ++ mapping = page_mapping(page); ++ if (!mapping || !mapping->host || ++ !(mapping->host->i_flags & S_ATOMIC_COPY)) ++ SetPagePageset2(page); ++ } ++ } ++ spin_unlock_irqrestore(&zone->lru_lock, flags); ++ } ++} ++ ++/* ++ * toi_mark_task_as_pageset ++ * Functionality : Marks all the saveable pages belonging to a given process ++ * as belonging to a particular pageset. 
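++ *
++ * Typical call pattern (a sketch based on mark_tasks() and the
++ * attention list handling in toi_mark_pages_for_pageset2() below):
++ *
++ *	toi_mark_task_as_pageset(p, PAGESET2);	most userspace tasks
++ *	toi_mark_task_as_pageset(p, PAGESET1);	tasks on the attention list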
++ */ ++ ++static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ ++ mm = t->active_mm; ++ ++ if (!mm || !mm->mmap) ++ return; ++ ++ if (!irqs_disabled()) ++ down_read(&mm->mmap_sem); ++ ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ unsigned long posn; ++ ++ if (!vma->vm_start || ++ vma->vm_flags & (VM_IO | VM_DONTDUMP | VM_PFNMAP)) ++ continue; ++ ++ for (posn = vma->vm_start; posn < vma->vm_end; ++ posn += PAGE_SIZE) { ++ struct page *page = follow_page(vma, posn, 0); ++ struct address_space *mapping; ++ ++ if (!page || !pfn_valid(page_to_pfn(page))) ++ continue; ++ ++ mapping = page_mapping(page); ++ if (mapping && mapping->host && ++ mapping->host->i_flags & S_ATOMIC_COPY) ++ continue; ++ ++ if (pageset2) ++ SetPagePageset2(page); ++ else { ++ ClearPagePageset2(page); ++ SetPagePageset1(page); ++ } ++ } ++ } ++ ++ if (!irqs_disabled()) ++ up_read(&mm->mmap_sem); ++} ++ ++static void mark_tasks(int pageset) ++{ ++ struct task_struct *p; ++ ++ toi_read_lock_tasklist(); ++ for_each_process(p) { ++ if (!p->mm) ++ continue; ++ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ toi_mark_task_as_pageset(p, pageset); ++ } ++ toi_read_unlock_tasklist(); ++ ++} ++ ++/* mark_pages_for_pageset2 ++ * ++ * Description: Mark unshared pages in processes not needed for hibernate as ++ * being able to be written out in a separate pagedir. ++ * HighMem pages are simply marked as pageset2. They won't be ++ * needed during hibernate. ++ */ ++ ++static void toi_mark_pages_for_pageset2(void) ++{ ++ struct attention_list *this = attention_list; ++ ++ memory_bm_clear(pageset2_map); ++ ++ if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) ++ return; ++ ++ if (test_action_state(TOI_PAGESET2_FULL)) ++ pageset2_full(); ++ else ++ mark_tasks(PAGESET2); ++ ++ /* ++ * Because the tasks in attention_list are ones related to hibernating, ++ * we know that they won't go away under us. ++ */ ++ ++ while (this) { ++ if (!test_result_state(TOI_ABORTED)) ++ toi_mark_task_as_pageset(this->task, PAGESET1); ++ this = this->next; ++ } ++} ++ ++/* ++ * The atomic copy of pageset1 is stored in pageset2 pages. ++ * But if pageset1 is larger (normally only just after boot), ++ * we need to allocate extra pages to store the atomic copy. ++ * The following data struct and functions are used to handle ++ * the allocation and freeing of that memory. ++ */ ++ ++static unsigned long extra_pages_allocated; ++ ++struct extras { ++ struct page *page; ++ int order; ++ struct extras *next; ++}; ++ ++static struct extras *extras_list; ++ ++/* toi_free_extra_pagedir_memory ++ * ++ * Description: Free previously allocated extra pagedir memory. ++ */ ++void toi_free_extra_pagedir_memory(void) ++{ ++ /* Free allocated pages */ ++ while (extras_list) { ++ struct extras *this = extras_list; ++ int i; ++ ++ extras_list = this->next; ++ ++ for (i = 0; i < (1 << this->order); i++) ++ ClearPageNosave(this->page + i); ++ ++ toi_free_pages(9, this->page, this->order); ++ toi_kfree(7, this, sizeof(*this)); ++ } ++ ++ extra_pages_allocated = 0; ++} ++ ++/* toi_allocate_extra_pagedir_memory ++ * ++ * Description: Allocate memory for making the atomic copy of pagedir1 in the ++ * case where it is bigger than pagedir2. ++ * Arguments: int num_to_alloc: Number of extra pages needed. ++ * Result: int. Number of extra pages we now have allocated. 
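++ *
++ * Worked example (illustrative numbers only): for num_to_alloc = 5,
++ * fls(5) = 3, so we first consider order 3 (8 pages), step down to
++ * order 2 (4 pages), allocate those, then satisfy the remaining page
++ * with an order 0 allocation.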
++ */ ++static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) ++{ ++ int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; ++ gfp_t flags = TOI_ATOMIC_GFP; ++ ++ if (num_to_alloc < 1) ++ return 0; ++ ++ order = fls(num_to_alloc); ++ if (order >= MAX_ORDER) ++ order = MAX_ORDER - 1; ++ ++ while (num_to_alloc) { ++ struct page *newpage; ++ unsigned long virt; ++ struct extras *extras_entry; ++ ++ while ((1 << order) > num_to_alloc) ++ order--; ++ ++ extras_entry = (struct extras *) toi_kzalloc(7, ++ sizeof(struct extras), TOI_ATOMIC_GFP); ++ ++ if (!extras_entry) ++ return extra_pages_allocated; ++ ++ virt = toi_get_free_pages(9, flags, order); ++ while (!virt && order) { ++ order--; ++ virt = toi_get_free_pages(9, flags, order); ++ } ++ ++ if (!virt) { ++ toi_kfree(7, extras_entry, sizeof(*extras_entry)); ++ return extra_pages_allocated; ++ } ++ ++ newpage = virt_to_page(virt); ++ ++ extras_entry->page = newpage; ++ extras_entry->order = order; ++ extras_entry->next = extras_list; ++ ++ extras_list = extras_entry; ++ ++ for (j = 0; j < (1 << order); j++) { ++ SetPageNosave(newpage + j); ++ SetPagePageset1Copy(newpage + j); ++ } ++ ++ extra_pages_allocated += (1 << order); ++ num_to_alloc -= (1 << order); ++ } ++ ++ return extra_pages_allocated; ++} ++ ++/* ++ * real_nr_free_pages: Count pcp pages for a zone type or all zones ++ * (-1 for all, otherwise zone_idx() result desired). ++ */ ++unsigned long real_nr_free_pages(unsigned long zone_idx_mask) ++{ ++ struct zone *zone; ++ int result = 0, cpu; ++ ++ /* PCP lists */ ++ for_each_populated_zone(zone) { ++ if (!(zone_idx_mask & (1 << zone_idx(zone)))) ++ continue; ++ ++ for_each_online_cpu(cpu) { ++ struct per_cpu_pageset *pset = ++ per_cpu_ptr(zone->pageset, cpu); ++ struct per_cpu_pages *pcp = &pset->pcp; ++ result += pcp->count; ++ } ++ ++ result += zone_page_state(zone, NR_FREE_PAGES); ++ } ++ return result; ++} ++EXPORT_SYMBOL_GPL(real_nr_free_pages); ++ ++/* ++ * Discover how much extra memory will be required by the drivers ++ * when they're asked to hibernate. We can then ensure that amount ++ * of memory is available when we really want it. ++ */ ++static void get_extra_pd1_allowance(void) ++{ ++ unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; ++ ++ toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); ++ ++ if (toi_go_atomic(PMSG_FREEZE, 1)) ++ return; ++ ++ final = real_nr_free_pages(all_zones_mask); ++ toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); ++ ++ extra_pd1_pages_allowance = (orig_num_free > final) ? ++ orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : ++ MIN_EXTRA_PAGES_ALLOWANCE; ++} ++ ++/* ++ * Amount of storage needed, possibly taking into account the ++ * expected compression ratio and possibly also ignoring our ++ * allowance for extra pages. ++ */ ++static unsigned long main_storage_needed(int use_ecr, ++ int ignore_extra_pd1_allow) ++{ ++ return (pagedir1.size + pagedir2.size + ++ (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * ++ (use_ecr ? toi_expected_compression_ratio() : 100) / 100; ++} ++ ++/* ++ * Storage needed for the image header, in bytes until the return. 
++ */ ++unsigned long get_header_storage_needed(void) ++{ ++ unsigned long bytes = sizeof(struct toi_header) + ++ toi_header_storage_for_modules() + ++ toi_pageflags_space_needed() + ++ fs_info_space_needed(); ++ ++ return DIV_ROUND_UP(bytes, PAGE_SIZE); ++} ++EXPORT_SYMBOL_GPL(get_header_storage_needed); ++ ++/* ++ * When freeing memory, pages from either pageset might be freed. ++ * ++ * When seeking to free memory to be able to hibernate, for every ps1 page ++ * freed, we need 2 less pages for the atomic copy because there is one less ++ * page to copy and one more page into which data can be copied. ++ * ++ * Freeing ps2 pages saves us nothing directly. No more memory is available ++ * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but ++ * that's too much work to figure out. ++ * ++ * => ps1_to_free functions ++ * ++ * Of course if we just want to reduce the image size, because of storage ++ * limitations or an image size limit either ps will do. ++ * ++ * => any_to_free function ++ */ ++ ++static unsigned long lowpages_usable_for_highmem_copy(void) ++{ ++ unsigned long needed = get_lowmem_size(pagedir1) + ++ extra_pd1_pages_allowance + MIN_FREE_RAM + ++ toi_memory_for_modules(0), ++ available = get_lowmem_size(pagedir2) + ++ real_nr_free_low_pages() + extra_pages_allocated; ++ ++ return available > needed ? available - needed : 0; ++} ++ ++static unsigned long highpages_ps1_to_free(void) ++{ ++ unsigned long need = get_highmem_size(pagedir1), ++ available = get_highmem_size(pagedir2) + ++ real_nr_free_high_pages() + ++ lowpages_usable_for_highmem_copy(); ++ ++ return need > available ? DIV_ROUND_UP(need - available, 2) : 0; ++} ++ ++static unsigned long lowpages_ps1_to_free(void) ++{ ++ unsigned long needed = get_lowmem_size(pagedir1) + ++ extra_pd1_pages_allowance + MIN_FREE_RAM + ++ toi_memory_for_modules(0), ++ available = get_lowmem_size(pagedir2) + ++ real_nr_free_low_pages() + extra_pages_allocated; ++ ++ return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; ++} ++ ++static unsigned long current_image_size(void) ++{ ++ return pagedir1.size + pagedir2.size + header_storage_needed; ++} ++ ++static unsigned long storage_still_required(void) ++{ ++ unsigned long needed = main_storage_needed(1, 1); ++ return needed > storage_limit ? needed - storage_limit : 0; ++} ++ ++static unsigned long ram_still_required(void) ++{ ++ unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + ++ 2 * extra_pd1_pages_allowance, ++ available = real_nr_free_low_pages() + extra_pages_allocated; ++ return needed > available ? needed - available : 0; ++} ++ ++unsigned long any_to_free(int use_image_size_limit) ++{ ++ int use_soft_limit = use_image_size_limit && image_size_limit > 0; ++ unsigned long current_size = current_image_size(), ++ soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, ++ to_free = use_soft_limit ? (current_size > soft_limit ? ++ current_size - soft_limit : 0) : 0, ++ storage_limit = storage_still_required(), ++ ram_limit = ram_still_required(), ++ first_max = max(to_free, storage_limit); ++ ++ return max(first_max, ram_limit); ++} ++ ++static int need_pageset2(void) ++{ ++ return (real_nr_free_low_pages() + extra_pages_allocated - ++ 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - ++ toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; ++} ++ ++/* amount_needed ++ * ++ * Calculates the amount by which the image size needs to be reduced to meet ++ * our constraints. 
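++ *
++ * E.g. (made-up numbers): if 1000 highmem + 200 lowmem pageset1 pages
++ * must be freed but the storage/RAM constraints only ask for 800
++ * pages, the result is max(1200, 800) = 1200.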
++ */ ++static unsigned long amount_needed(int use_image_size_limit) ++{ ++ return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), ++ any_to_free(use_image_size_limit)); ++} ++ ++static int image_not_ready(int use_image_size_limit) ++{ ++ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, ++ "Amount still needed (%lu) > 0:%u," ++ " Storage allocd: %lu < %lu: %u.\n", ++ amount_needed(use_image_size_limit), ++ (amount_needed(use_image_size_limit) > 0), ++ main_storage_allocated, ++ main_storage_needed(1, 1), ++ main_storage_allocated < main_storage_needed(1, 1)); ++ ++ toi_cond_pause(0, NULL); ++ ++ return (amount_needed(use_image_size_limit) > 0) || ++ main_storage_allocated < main_storage_needed(1, 1); ++} ++ ++static void display_failure_reason(int tries_exceeded) ++{ ++ unsigned long storage_required = storage_still_required(), ++ ram_required = ram_still_required(), ++ high_ps1 = highpages_ps1_to_free(), ++ low_ps1 = lowpages_ps1_to_free(); ++ ++ printk(KERN_INFO "Failed to prepare the image because...\n"); ++ ++ if (!storage_limit) { ++ printk(KERN_INFO "- You need some storage available to be " ++ "able to hibernate.\n"); ++ return; ++ } ++ ++ if (tries_exceeded) ++ printk(KERN_INFO "- The maximum number of iterations was " ++ "reached without successfully preparing the " ++ "image.\n"); ++ ++ if (storage_required) { ++ printk(KERN_INFO " - We need at least %lu pages of storage " ++ "(ignoring the header), but only have %lu.\n", ++ main_storage_needed(1, 1), ++ main_storage_allocated); ++ set_abort_result(TOI_INSUFFICIENT_STORAGE); ++ } ++ ++ if (ram_required) { ++ printk(KERN_INFO " - We need %lu more free pages of low " ++ "memory.\n", ram_required); ++ printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); ++ printk(KERN_INFO " + Reqd. by modules : %8lu\n", ++ toi_memory_for_modules(0)); ++ printk(KERN_INFO " + 2 * extra allow : %8lu\n", ++ 2 * extra_pd1_pages_allowance); ++ printk(KERN_INFO " - Currently free : %8lu\n", ++ real_nr_free_low_pages()); ++ printk(KERN_INFO " - Pages allocd : %8lu\n", ++ extra_pages_allocated); ++ printk(KERN_INFO " : ========\n"); ++ printk(KERN_INFO " Still needed : %8lu\n", ++ ram_required); ++ ++ /* Print breakdown of memory needed for modules */ ++ toi_memory_for_modules(1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++ ++ if (high_ps1) { ++ printk(KERN_INFO "- We need to free %lu highmem pageset 1 " ++ "pages.\n", high_ps1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++ ++ if (low_ps1) { ++ printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " ++ "pages.\n", low_ps1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++} ++ ++static void display_stats(int always, int sub_extra_pd1_allow) ++{ ++ char buffer[255]; ++ snprintf(buffer, 254, ++ "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " ++ "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). 
" ++ "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", ++ ++ /* Free */ ++ real_nr_free_pages(all_zones_mask), ++ real_nr_free_low_pages(), ++ ++ /* Sets */ ++ pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), ++ pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), ++ ++ /* Nosave */ ++ num_nosave, extra_pages_allocated, ++ num_nosave - extra_pages_allocated, ++ ++ /* Storage */ ++ main_storage_allocated, ++ storage_limit, ++ main_storage_needed(1, sub_extra_pd1_allow), ++ main_storage_needed(1, 1), ++ ++ /* Needed */ ++ lowpages_ps1_to_free(), highpages_ps1_to_free(), ++ any_to_free(1), ++ MIN_FREE_RAM, toi_memory_for_modules(0), ++ extra_pd1_pages_allowance, ++ image_size_limit, ++ ++ need_pageset2() ? "yes" : "no"); ++ ++ if (always) ++ printk("%s", buffer); ++ else ++ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); ++} ++ ++/* generate_free_page_map ++ * ++ * Description: This routine generates a bitmap of free pages from the ++ * lists used by the memory manager. We then use the bitmap ++ * to quickly calculate which pages to save and in which ++ * pagesets. ++ */ ++static void generate_free_page_map(void) ++{ ++ int order, cpu, t; ++ unsigned long flags, i; ++ struct zone *zone; ++ struct list_head *curr; ++ unsigned long pfn; ++ struct page *page; ++ ++ for_each_populated_zone(zone) { ++ ++ if (!zone->spanned_pages) ++ continue; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ ++ for (i = 0; i < zone->spanned_pages; i++) { ++ pfn = zone->zone_start_pfn + i; ++ ++ if (!pfn_valid(pfn)) ++ continue; ++ ++ page = pfn_to_page(pfn); ++ ++ ClearPageNosaveFree(page); ++ } ++ ++ for_each_migratetype_order(order, t) { ++ list_for_each(curr, ++ &zone->free_area[order].free_list[t]) { ++ unsigned long j; ++ ++ pfn = page_to_pfn(list_entry(curr, struct page, ++ lru)); ++ for (j = 0; j < (1UL << order); j++) ++ SetPageNosaveFree(pfn_to_page(pfn + j)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct per_cpu_pageset *pset = ++ per_cpu_ptr(zone->pageset, cpu); ++ struct per_cpu_pages *pcp = &pset->pcp; ++ struct page *page; ++ int t; ++ ++ for (t = 0; t < MIGRATE_PCPTYPES; t++) ++ list_for_each_entry(page, &pcp->lists[t], lru) ++ SetPageNosaveFree(page); ++ } ++ ++ spin_unlock_irqrestore(&zone->lock, flags); ++ } ++} ++ ++/* size_of_free_region ++ * ++ * Description: Return the number of pages that are free, beginning with and ++ * including this one. ++ */ ++static int size_of_free_region(struct zone *zone, unsigned long start_pfn) ++{ ++ unsigned long this_pfn = start_pfn, ++ end_pfn = zone->zone_start_pfn + zone->spanned_pages - 1; ++ ++ while (pfn_valid(this_pfn) && this_pfn <= end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) ++ this_pfn++; ++ ++ return this_pfn - start_pfn; ++} ++ ++/* flag_image_pages ++ * ++ * This routine generates our lists of pages to be stored in each ++ * pageset. Since we store the data using extents, and adding new ++ * extents might allocate a new extent page, this routine may well ++ * be called more than once. ++ */ ++static void flag_image_pages(int atomic_copy) ++{ ++ int num_free = 0; ++ unsigned long loop; ++ struct zone *zone; ++ ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ ++ set_highmem_size(pagedir1, 0); ++ set_highmem_size(pagedir2, 0); ++ ++ num_nosave = 0; ++ ++ memory_bm_clear(pageset1_map); ++ ++ generate_free_page_map(); ++ ++ /* ++ * Pages not to be saved are marked Nosave irrespective of being ++ * reserved. 
++ */ ++ for_each_populated_zone(zone) { ++ int highmem = is_highmem(zone); ++ ++ for (loop = 0; loop < zone->spanned_pages; loop++) { ++ unsigned long pfn = zone->zone_start_pfn + loop; ++ struct page *page; ++ int chunk_size; ++ ++ if (!pfn_valid(pfn)) ++ continue; ++ ++ chunk_size = size_of_free_region(zone, pfn); ++ if (chunk_size) { ++ num_free += chunk_size; ++ loop += chunk_size - 1; ++ continue; ++ } ++ ++ page = pfn_to_page(pfn); ++ ++ if (PageNosave(page)) { ++ num_nosave++; ++ continue; ++ } ++ ++ page = highmem ? saveable_highmem_page(zone, pfn) : ++ saveable_page(zone, pfn); ++ ++ if (!page) { ++ num_nosave++; ++ continue; ++ } ++ ++ if (PagePageset2(page)) { ++ pagedir2.size++; ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir2); ++ else ++ SetPagePageset1Copy(page); ++ if (PageResave(page)) { ++ SetPagePageset1(page); ++ ClearPagePageset1Copy(page); ++ pagedir1.size++; ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir1); ++ } ++ } else { ++ pagedir1.size++; ++ SetPagePageset1(page); ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir1); ++ } ++ } ++ } ++ ++ if (!atomic_copy) ++ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, ++ "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" ++ " + NumFree (%d) = %d.\n", ++ pagedir1.size, pagedir2.size, num_nosave, num_free, ++ pagedir1.size + pagedir2.size + num_nosave + num_free); ++} ++ ++void toi_recalculate_image_contents(int atomic_copy) ++{ ++ memory_bm_clear(pageset1_map); ++ if (!atomic_copy) { ++ unsigned long pfn; ++ memory_bm_position_reset(pageset2_map); ++ for (pfn = memory_bm_next_pfn(pageset2_map); ++ pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn(pageset2_map)) ++ ClearPagePageset1Copy(pfn_to_page(pfn)); ++ /* Need to call this before getting pageset1_size! */ ++ toi_mark_pages_for_pageset2(); ++ } ++ flag_image_pages(atomic_copy); ++ ++ if (!atomic_copy) { ++ storage_limit = toiActiveAllocator->storage_available(); ++ display_stats(0, 0); ++ } ++} ++ ++int try_allocate_extra_memory(void) ++{ ++ unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - ++ get_lowmem_size(pagedir2); ++ if (wanted > extra_pages_allocated) { ++ unsigned long got = toi_allocate_extra_pagedir_memory(wanted); ++ if (wanted < got) { ++ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, ++ "Want %d extra pages for pageset1, got %d.\n", ++ wanted, got); ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++ ++/* update_image ++ * ++ * Allocate [more] memory and storage for the image. ++ */ ++static void update_image(int ps2_recalc) ++{ ++ int old_header_req; ++ unsigned long seek; ++ ++ if (try_allocate_extra_memory()) ++ return; ++ ++ if (ps2_recalc) ++ goto recalc; ++ ++ thaw_kernel_threads(); ++ ++ /* ++ * Allocate remaining storage space, if possible, up to the ++ * maximum we know we'll need. It's okay to allocate the ++ * maximum if the writer is the swapwriter, but ++ * we don't want to grab all available space on an NFS share. ++ * We therefore ignore the expected compression ratio here, ++ * thereby trying to allocate the maximum image size we could ++ * need (assuming compression doesn't expand the image), but ++ * don't complain if we can't get the full amount we're after. ++ */ ++ ++ do { ++ int result; ++ ++ old_header_req = header_storage_needed; ++ toiActiveAllocator->reserve_header_space(header_storage_needed); ++ ++ /* How much storage is free with the reservation applied? 
*/ ++ storage_limit = toiActiveAllocator->storage_available(); ++ seek = min(storage_limit, main_storage_needed(0, 0)); ++ ++ result = toiActiveAllocator->allocate_storage(seek); ++ if (result) ++ printk("Failed to allocate storage (%d).\n", result); ++ ++ main_storage_allocated = ++ toiActiveAllocator->storage_allocated(); ++ ++ /* Need more header because more storage allocated? */ ++ header_storage_needed = get_header_storage_needed(); ++ ++ } while (header_storage_needed > old_header_req); ++ ++ if (freeze_kernel_threads()) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++recalc: ++ toi_recalculate_image_contents(0); ++} ++ ++/* attempt_to_freeze ++ * ++ * Try to freeze processes. ++ */ ++ ++static int attempt_to_freeze(void) ++{ ++ int result; ++ ++ /* Stop processes before checking again */ ++ toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " ++ "filesystems."); ++ result = freeze_processes(); ++ ++ if (result) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++ result = freeze_kernel_threads(); ++ ++ if (result) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++ return result; ++} ++ ++/* eat_memory ++ * ++ * Try to free some memory, either to meet hard or soft constraints on the image ++ * characteristics. ++ * ++ * Hard constraints: ++ * - Pageset1 must be < half of memory; ++ * - We must have enough memory free at resume time to have pageset1 ++ * be able to be loaded in pages that don't conflict with where it has to ++ * be restored. ++ * Soft constraints: ++ * - User specified image size limit. ++ */ ++static void eat_memory(void) ++{ ++ unsigned long amount_wanted = 0; ++ int did_eat_memory = 0; ++ ++ /* ++ * Note that if we have enough storage space and enough free memory, we ++ * may exit without eating anything. We give up when the last 10 ++ * iterations ate no extra pages because we're not going to get much ++ * more anyway, but the few pages we get will take a lot of time. ++ * ++ * We freeze processes before beginning, and then unfreeze them if we ++ * need to eat memory until we think we have enough. If our attempts ++ * to freeze fail, we give up and abort. ++ */ ++ ++ amount_wanted = amount_needed(1); ++ ++ switch (image_size_limit) { ++ case -1: /* Don't eat any memory */ ++ if (amount_wanted > 0) { ++ set_abort_result(TOI_WOULD_EAT_MEMORY); ++ return; ++ } ++ break; ++ case -2: /* Free caches only */ ++ drop_pagecache(); ++ toi_recalculate_image_contents(0); ++ amount_wanted = amount_needed(1); ++ break; ++ default: ++ break; ++ } ++ ++ if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && ++ image_size_limit != -1) { ++ unsigned long request = amount_wanted; ++ unsigned long high_req = max(highpages_ps1_to_free(), ++ any_to_free(1)); ++ unsigned long low_req = lowpages_ps1_to_free(); ++ unsigned long got = 0; ++ ++ toi_prepare_status(CLEAR_BAR, ++ "Seeking to free %ldMB of memory.", ++ MB(amount_wanted)); ++ ++ thaw_kernel_threads(); ++ ++ /* ++ * Ask for too many because shrink_memory_mask doesn't ++ * currently return enough most of the time.
++ */ ++ ++ if (low_req) ++ got = shrink_memory_mask(low_req, GFP_KERNEL); ++ if (high_req) ++ shrink_memory_mask(high_req - got, GFP_HIGHUSER); ++ ++ did_eat_memory = 1; ++ ++ toi_recalculate_image_contents(0); ++ ++ amount_wanted = amount_needed(1); ++ ++ printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" ++ " %ld pages from anywhere, got %ld.\n", ++ high_req, low_req, ++ request - amount_wanted); ++ ++ toi_cond_pause(0, NULL); ++ ++ if (freeze_kernel_threads()) ++ set_abort_result(TOI_FREEZING_FAILED); ++ } ++ ++ if (did_eat_memory) ++ toi_recalculate_image_contents(0); ++} ++ ++/* toi_prepare_image ++ * ++ * Entry point to the whole image preparation section. ++ * ++ * We do four things: ++ * - Freeze processes; ++ * - Ensure image size constraints are met; ++ * - Complete all the preparation for saving the image, ++ * including allocation of storage. The only memory ++ * that should be needed when we're finished is that ++ * for actually storing the image (and we know how ++ * much is needed for that because the modules tell ++ * us). ++ * - Make sure that all dirty buffers are written out. ++ */ ++#define MAX_TRIES 2 ++int toi_prepare_image(void) ++{ ++ int result = 1, tries = 1; ++ ++ main_storage_allocated = 0; ++ no_ps2_needed = 0; ++ ++ if (attempt_to_freeze()) ++ return 1; ++ ++ if (!extra_pd1_pages_allowance) ++ get_extra_pd1_allowance(); ++ ++ storage_limit = toiActiveAllocator->storage_available(); ++ ++ if (!storage_limit) { ++ printk(KERN_INFO "No storage available. Didn't try to prepare " ++ "an image.\n"); ++ display_failure_reason(0); ++ set_abort_result(TOI_NOSTORAGE_AVAILABLE); ++ return 1; ++ } ++ ++ if (build_attention_list()) { ++ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, ++ "Unable to successfully prepare the image.\n"); ++ return 1; ++ } ++ ++ toi_recalculate_image_contents(0); ++ ++ do { ++ toi_prepare_status(CLEAR_BAR, ++ "Preparing Image. Try %d.", tries); ++ ++ eat_memory(); ++ ++ if (test_result_state(TOI_ABORTED)) ++ break; ++ ++ update_image(0); ++ ++ tries++; ++ ++ } while (image_not_ready(1) && tries <= MAX_TRIES && ++ !test_result_state(TOI_ABORTED)); ++ ++ result = image_not_ready(0); ++ ++ if (!test_result_state(TOI_ABORTED)) { ++ if (result) { ++ display_stats(1, 0); ++ display_failure_reason(tries > MAX_TRIES); ++ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, ++ "Unable to successfully prepare the image.\n"); ++ } else { ++ /* Pageset 2 needed? */ ++ if (!need_pageset2() && ++ test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { ++ no_ps2_needed = 1; ++ toi_recalculate_image_contents(0); ++ update_image(1); ++ } ++ ++ toi_cond_pause(1, "Image preparation complete."); ++ } ++ } ++ ++ return result ? result : allocate_checksum_pages(); ++} +diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h +new file mode 100644 +index 0000000..2a2ca0b +--- /dev/null ++++ b/kernel/power/tuxonice_prepare_image.h +@@ -0,0 +1,38 @@ ++/* ++ * kernel/power/tuxonice_prepare_image.h ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ */ ++ ++#include ++ ++extern int toi_prepare_image(void); ++extern void toi_recalculate_image_contents(int storage_available); ++extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); ++extern long image_size_limit; ++extern void toi_free_extra_pagedir_memory(void); ++extern unsigned long extra_pd1_pages_allowance; ++extern void free_attention_list(void); ++ ++#define MIN_FREE_RAM 100 ++#define MIN_EXTRA_PAGES_ALLOWANCE 500 ++ ++#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) ++#ifdef CONFIG_HIGHMEM ++#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) ++#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ ++ (1 << ZONE_HIGHMEM))) ++#else ++#define real_nr_free_high_pages() (0) ++#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) ++ ++/* For eat_memory function */ ++#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) ++#endif ++ ++unsigned long get_header_storage_needed(void); ++unsigned long any_to_free(int use_image_size_limit); ++int try_allocate_extra_memory(void); +diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c +new file mode 100644 +index 0000000..9a9444d +--- /dev/null ++++ b/kernel/power/tuxonice_prune.c +@@ -0,0 +1,419 @@ ++/* ++ * kernel/power/tuxonice_prune.c ++ * ++ * Copyright (C) 2012 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file implements a TuxOnIce module that seeks to prune the ++ * amount of data written to disk. It builds a table of hashes ++ * of the uncompressed data, and writes the pfn of the previous page ++ * with the same contents instead of repeating the data when a match ++ * is found. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++/* ++ * We never write a page bigger than PAGE_SIZE, so use a large number ++ * to indicate that data is a PFN. ++ */ ++#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) ++ ++static unsigned long toi_pruned_pages; ++ ++static struct toi_module_ops toi_prune_ops; ++static struct toi_module_ops *next_driver; ++ ++static char toi_prune_hash_algo_name[32] = "sha1"; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ struct shash_desc desc; ++ char *digest; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. 
++ */ ++static int toi_prune_crypto_prepare(void) ++{ ++ int cpu, ret, digestsize = 0; ++ ++ if (!*toi_prune_hash_algo_name) { ++ printk(KERN_INFO "TuxOnIce: Pruning enabled but no " ++ "hash algorithm set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); ++ if (IS_ERR(this->desc.tfm)) { ++ printk(KERN_INFO "TuxOnIce: Failed to allocate the " ++ "%s prune hash algorithm.\n", ++ toi_prune_hash_algo_name); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ if (!digestsize) ++ digestsize = crypto_shash_digestsize(this->desc.tfm); ++ ++ this->digest = kmalloc(digestsize, GFP_KERNEL); ++ if (!this->digest) { ++ printk(KERN_INFO "TuxOnIce: Failed to allocate space " ++ "for digest output.\n"); ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ this->desc.flags = 0; ++ ++ ret = crypto_shash_init(&this->desc); ++ if (ret < 0) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s prune hash algorithm.\n", ++ toi_prune_hash_algo_name); ++ kfree(this->digest); ++ this->digest = NULL; ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++static int toi_prune_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->desc.tfm) { ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->digest) { ++ kfree(this->digest); ++ this->digest = NULL; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_prune_init ++ */ ++ ++static int toi_prune_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ toi_pruned_pages = 0; ++ ++ next_driver = toi_get_next_filter(&toi_prune_ops); ++ ++ return next_driver ? 0 : -ECHILD; ++} ++ ++/* ++ * toi_prune_rw_init() ++ */ ++ ++static int toi_prune_rw_init(int rw, int stream_number) ++{ ++ if (toi_prune_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise prune " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ "pruning the image.\n"); ++ toi_prune_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_prune_write_page() ++ * ++ * Hash a page of data and pass it on to the next module in the ++ * pipeline. ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be checked. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, or -ECHILD if we have a broken pipeline.
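++ *
++ * Intended flow, going by the description at the top of this file (the
++ * code below currently only computes the digest and updates the
++ * statistics, so treat this as a sketch of the design):
++ *
++ *	1. hash the page with the configured shash algorithm;
++ *	2. if an earlier page had the same hash, write PRUNE_DATA_IS_PFN
++ *	   as the length and the pfn of that earlier page instead of the
++ *	   data;
++ *	3. otherwise remember the hash and write the page as usual.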
++ */ ++static int toi_prune_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ u8 *output_buffer = buffer_page; ++ int output_len = buf_size; ++ int out_buf_type = buf_type; ++ void *buffer_start; ++ ++ if (ctx->desc.tfm) { ++ buffer_start = TOI_MAP(buf_type, buffer_page); ++ ++ ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, ctx->digest); ++ if (ret) { ++ printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); ++ } else { ++ mutex_lock(&stats_lock); ++ toi_pruned_pages++; ++ mutex_unlock(&stats_lock); ++ } ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ } ++ ++ ret = next_driver->write_page(index, out_buf_type, ++ output_buffer, output_len); ++ ++ return ret; ++} ++ ++/* ++ * toi_prune_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Retrieve data from later modules or from a previously loaded page and ++ * fill the input buffer. ++ * Zero if successful. Error condition from me or from downstream on failure. ++ */ ++static int toi_prune_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int ret, cpu = smp_processor_id(); ++ unsigned int len; ++ char *buffer_start; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!ctx->desc.tfm) ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, ++ buf_size); ++ ++ /* ++ * All our reads must be synchronous - we can't handle ++ * data that hasn't been read yet. ++ */ ++ ++ ret = next_driver->read_page(index, buf_type, buffer_page, &len); ++ ++ if (len == PRUNE_DATA_IS_PFN) { ++ buffer_start = kmap(buffer_page); ++ } ++ ++ return ret; ++} ++ ++/* ++ * toi_prune_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_prune_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ /* Output the number of pages pruned. */ ++ if (*toi_prune_hash_algo_name) ++ len = scnprintf(buffer, size, "- Hash algorithm is '%s'.\n", ++ toi_prune_hash_algo_name); ++ else ++ len = scnprintf(buffer, size, "- Hash algorithm is not set.\n"); ++ ++ if (toi_pruned_pages) ++ len += scnprintf(buffer+len, size - len, " Pruned " ++ "%lu pages.\n", ++ toi_pruned_pages); ++ return len; ++} ++ ++/* ++ * toi_prune_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_prune_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_prune_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + 2 * sizeof(int) + ++ strlen(toi_prune_hash_algo_name) + 1; ++} ++ ++/* ++ * toi_prune_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save information needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data.
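++ *
++ * Layout written below (see toi_prune_save_config_info()):
++ * [unsigned long toi_pruned_pages][int name_len][hash algorithm name,
++ * name_len bytes including the trailing NUL].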
++ */ ++static int toi_prune_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_pruned_pages; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_prune_hash_algo_name, len); ++ return offset + len; ++} ++ ++/* toi_prune_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for passing back to the ++ * resumed kernel. ++ */ ++static void toi_prune_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_pruned_pages = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_prune_hash_algo_name, buffer + offset, len); ++} ++ ++static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->pruned_pages = toi_pruned_pages; ++} ++ ++static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_pruned_pages = bkd->pruned_pages; ++} ++ ++/* ++ * toi_expected_ratio ++ * ++ * Description: Returns the expected ratio between data passed into this module ++ * and the amount of data output when writing. ++ * Returns: 100 - we have no idea how many pages will be pruned. ++ */ ++ ++static int toi_prune_expected_ratio(void) ++{ ++ return 100; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_prune_ops = { ++ .type = FILTER_MODULE, ++ .name = "prune", ++ .directory = "prune", ++ .module = THIS_MODULE, ++ .initialise = toi_prune_init, ++ .memory_needed = toi_prune_memory_needed, ++ .print_debug_info = toi_prune_print_debug_stats, ++ .save_config_info = toi_prune_save_config_info, ++ .load_config_info = toi_prune_load_config_info, ++ .storage_needed = toi_prune_storage_needed, ++ .expected_compression = toi_prune_expected_ratio, ++ ++ .pre_atomic_restore = toi_prune_pre_atomic_restore, ++ .post_atomic_restore = toi_prune_post_atomic_restore, ++ ++ .rw_init = toi_prune_rw_init, ++ .rw_cleanup = toi_prune_rw_cleanup, ++ ++ .write_page = toi_prune_write_page, ++ .read_page = toi_prune_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_prune_load(void) ++{ ++ return toi_register_module(&toi_prune_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_prune_unload(void) ++{ ++ toi_unregister_module(&toi_prune_ops); ++} ++ ++module_init(toi_prune_load); ++module_exit(toi_prune_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Image Pruning Support for TuxOnIce"); ++#else ++late_initcall(toi_prune_load); ++#endif +diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c +new file mode 100644 +index 0000000..dcf83f4 +--- /dev/null ++++ b/kernel/power/tuxonice_storage.c +@@ -0,0 +1,283 @@ ++/* ++ * kernel/power/tuxonice_storage.c ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ * Routines for talking to a userspace program that manages storage. ++ * ++ * The kernel side: ++ * - starts the userspace program; ++ * - sends messages telling it when to open and close the connection; ++ * - tells it when to quit; ++ * ++ * The user space side: ++ * - passes messages regarding status; ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_ui.h" ++ ++static struct user_helper_data usm_helper_data; ++static struct toi_module_ops usm_ops; ++static int message_received, usm_prepare_count; ++static int storage_manager_last_action, storage_manager_action; ++ ++static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type; ++ int *data; ++ ++ type = nlh->nlmsg_type; ++ ++ /* A control message: ignore them */ ++ if (type < NETLINK_MSG_BASE) ++ return 0; ++ ++ /* Unknown message: reply with EINVAL */ ++ if (type >= USM_MSG_MAX) ++ return -EINVAL; ++ ++ /* All operations require privileges, even GET */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) ++ return -EBUSY; ++ ++ data = (int *) NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case USM_MSG_SUCCESS: ++ case USM_MSG_FAILED: ++ message_received = type; ++ complete(&usm_helper_data.wait_for_process); ++ break; ++ default: ++ printk(KERN_INFO "Storage manager doesn't recognise " ++ "message %d.\n", type); ++ } ++ ++ return 1; ++} ++ ++#ifdef CONFIG_NET ++static int activations; ++ ++int toi_activate_storage(int force) ++{ ++ int tries = 1; ++ ++ if (usm_helper_data.pid == -1 || !usm_ops.enabled) ++ return 0; ++ ++ message_received = 0; ++ activations++; ++ ++ if (activations > 1 && !force) ++ return 0; ++ ++ while ((!message_received || message_received == USM_MSG_FAILED) && ++ tries < 2) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " ++ "%d.\n", tries); ++ ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ toi_send_netlink_message(&usm_helper_data, ++ USM_MSG_CONNECT, ++ NULL, 0); ++ ++ /* Wait 2 seconds for the userspace process to make contact */ ++ wait_for_completion_timeout(&usm_helper_data.wait_for_process, ++ 2*HZ); ++ ++ tries++; ++ } ++ ++ return 0; ++} ++ ++int toi_deactivate_storage(int force) ++{ ++ if (usm_helper_data.pid == -1 || !usm_ops.enabled) ++ return 0; ++ ++ message_received = 0; ++ activations--; ++ ++ if (activations && !force) ++ return 0; ++ ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ toi_send_netlink_message(&usm_helper_data, ++ USM_MSG_DISCONNECT, ++ NULL, 0); ++ ++ wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); ++ ++ if (!message_received || message_received == USM_MSG_FAILED) { ++ printk(KERN_INFO "Returning failure disconnecting storage.\n"); ++ return 1; ++ } ++ ++ return 0; ++} ++#endif ++ ++static void storage_manager_simulate(void) ++{ ++ printk(KERN_INFO "--- Storage manager simulate ---\n"); ++ toi_prepare_usm(); ++ schedule(); ++ printk(KERN_INFO "--- Activate storage 1 ---\n"); ++ toi_activate_storage(1); ++ schedule(); ++ printk(KERN_INFO "--- Deactivate storage 1 ---\n"); ++ toi_deactivate_storage(1); ++ schedule(); ++ printk(KERN_INFO "--- Cleanup usm ---\n"); ++ toi_cleanup_usm(); ++ schedule(); ++ printk(KERN_INFO "--- Storage manager simulate ends ---\n"); ++} ++ ++static int usm_storage_needed(void) ++{ ++ return 
sizeof(int) + strlen(usm_helper_data.program) + 1; ++} ++ ++static int usm_save_config_info(char *buf) ++{ ++ int len = strlen(usm_helper_data.program); ++ memcpy(buf, usm_helper_data.program, len + 1); ++ return sizeof(int) + len + 1; ++} ++ ++static void usm_load_config_info(char *buf, int size) ++{ ++ /* Don't load the saved path if one has already been set */ ++ if (usm_helper_data.program[0]) ++ return; ++ ++ memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); ++} ++ ++static int usm_memory_needed(void) ++{ ++ /* ball park figure of 32 pages */ ++ return 32 * PAGE_SIZE; ++} ++ ++/* toi_prepare_usm ++ */ ++int toi_prepare_usm(void) ++{ ++ usm_prepare_count++; ++ ++ if (usm_prepare_count > 1 || !usm_ops.enabled) ++ return 0; ++ ++ usm_helper_data.pid = -1; ++ ++ if (!*usm_helper_data.program) ++ return 0; ++ ++ toi_netlink_setup(&usm_helper_data); ++ ++ if (usm_helper_data.pid == -1) ++ printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" ++ " start it.\n"); ++ ++ toi_activate_storage(0); ++ ++ return usm_helper_data.pid != -1; ++} ++ ++void toi_cleanup_usm(void) ++{ ++ usm_prepare_count--; ++ ++ if (usm_helper_data.pid > -1 && !usm_prepare_count) { ++ toi_deactivate_storage(0); ++ toi_netlink_close(&usm_helper_data); ++ } ++} ++ ++static void storage_manager_activate(void) ++{ ++ if (storage_manager_action == storage_manager_last_action) ++ return; ++ ++ if (storage_manager_action) ++ toi_prepare_usm(); ++ else ++ toi_cleanup_usm(); ++ ++ storage_manager_last_action = storage_manager_action; ++} ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), ++ SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), ++ SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, ++ NULL), ++ SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, ++ 0, storage_manager_activate) ++}; ++ ++static struct toi_module_ops usm_ops = { ++ .type = MISC_MODULE, ++ .name = "usm", ++ .directory = "storage_manager", ++ .module = THIS_MODULE, ++ .storage_needed = usm_storage_needed, ++ .save_config_info = usm_save_config_info, ++ .load_config_info = usm_load_config_info, ++ .memory_needed = usm_memory_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* toi_usm_sysfs_init ++ * Description: Boot time initialisation for user interface. ++ */ ++int toi_usm_init(void) ++{ ++ usm_helper_data.nl = NULL; ++ usm_helper_data.program[0] = '\0'; ++ usm_helper_data.pid = -1; ++ usm_helper_data.skb_size = 0; ++ usm_helper_data.pool_limit = 6; ++ usm_helper_data.netlink_id = NETLINK_TOI_USM; ++ usm_helper_data.name = "userspace storage manager"; ++ usm_helper_data.rcv_msg = usm_user_rcv_msg; ++ usm_helper_data.interface_version = 2; ++ usm_helper_data.must_init = 0; ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ return toi_register_module(&usm_ops); ++} ++ ++void toi_usm_exit(void) ++{ ++ toi_netlink_close_complete(&usm_helper_data); ++ toi_unregister_module(&usm_ops); ++} +diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h +new file mode 100644 +index 0000000..8c6b5a7 +--- /dev/null ++++ b/kernel/power/tuxonice_storage.h +@@ -0,0 +1,45 @@ ++/* ++ * kernel/power/tuxonice_storage.h ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifdef CONFIG_NET ++int toi_prepare_usm(void); ++void toi_cleanup_usm(void); ++ ++int toi_activate_storage(int force); ++int toi_deactivate_storage(int force); ++extern int toi_usm_init(void); ++extern void toi_usm_exit(void); ++#else ++static inline int toi_usm_init(void) { return 0; } ++static inline void toi_usm_exit(void) { } ++ ++static inline int toi_activate_storage(int force) ++{ ++ return 0; ++} ++ ++static inline int toi_deactivate_storage(int force) ++{ ++ return 0; ++} ++ ++static inline int toi_prepare_usm(void) { return 0; } ++static inline void toi_cleanup_usm(void) { } ++#endif ++ ++enum { ++ USM_MSG_BASE = 0x10, ++ ++ /* Kernel -> Userspace */ ++ USM_MSG_CONNECT = 0x30, ++ USM_MSG_DISCONNECT = 0x31, ++ USM_MSG_SUCCESS = 0x40, ++ USM_MSG_FAILED = 0x41, ++ ++ USM_MSG_MAX, ++}; +diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c +new file mode 100644 +index 0000000..a6c0d76 +--- /dev/null ++++ b/kernel/power/tuxonice_swap.c +@@ -0,0 +1,463 @@ ++/* ++ * kernel/power/tuxonice_swap.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file encapsulates functions for usage of swap space as a ++ * backing store. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++ ++static struct toi_module_ops toi_swapops; ++ ++/* For swapfile automatically swapon/off'd. */ ++static char swapfilename[255] = ""; ++static int toi_swapon_status; ++ ++/* Swap Pages */ ++static unsigned long swap_allocated; ++ ++static struct sysinfo swapinfo; ++ ++static int is_ram_backed(struct swap_info_struct *si) ++{ ++ if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || ++ !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) ++ return 1; ++ ++ return 0; ++} ++ ++/** ++ * enable_swapfile: Swapon the user specified swapfile prior to hibernating. ++ * ++ * Activate the given swapfile if it wasn't already enabled. Remember whether ++ * we really did swapon it for swapoffing later. ++ */ ++static void enable_swapfile(void) ++{ ++ int activateswapresult = -EINVAL; ++ ++ if (swapfilename[0]) { ++ /* Attempt to swap on with maximum priority */ ++ activateswapresult = sys_swapon(swapfilename, 0xFFFF); ++ if (activateswapresult && activateswapresult != -EBUSY) ++ printk(KERN_ERR "TuxOnIce: The swapfile/partition " ++ "specified by /sys/power/tuxonice/swap/swapfile" ++ " (%s) could not be turned on (error %d). " ++ "Attempting to continue.\n", ++ swapfilename, activateswapresult); ++ if (!activateswapresult) ++ toi_swapon_status = 1; ++ } ++} ++ ++/** ++ * disable_swapfile: Swapoff any file swaponed at the start of the cycle. ++ * ++ * If we did successfully swapon a file at the start of the cycle, swapoff ++ * it now (finishing up). 
++ */ ++static void disable_swapfile(void) ++{ ++ if (!toi_swapon_status) ++ return; ++ ++ sys_swapoff(swapfilename); ++ toi_swapon_status = 0; ++} ++ ++static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, ++ unsigned long start, unsigned long end) ++{ ++ if (test_action_state(TOI_TEST_BIO)) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " ++ "chain %p.", start << chain->bmap_shift, ++ end << chain->bmap_shift, chain); ++ ++ return toi_add_to_extent_chain(&chain->blocks, start, end); ++} ++ ++ ++static int get_main_pool_phys_params(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *extentpointer = NULL; ++ unsigned long address, extent_min = 0, extent_max = 0; ++ int empty = 1; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " ++ "chain %d.", chain->allocator_index); ++ ++ if (!chain->allocations.first) ++ return 0; ++ ++ if (chain->blocks.first) ++ toi_put_extent_chain(&chain->blocks); ++ ++ toi_extent_for_each(&chain->allocations, extentpointer, address) { ++ swp_entry_t swap_address = (swp_entry_t) { address }; ++ struct block_device *bdev; ++ sector_t new_sector = map_swap_entry(swap_address, &bdev); ++ ++ if (empty) { ++ empty = 0; ++ extent_min = extent_max = new_sector; ++ continue; ++ } ++ ++ if (new_sector == extent_max + 1) { ++ extent_max++; ++ continue; ++ } ++ ++ if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { ++ printk(KERN_ERR "Out of memory while making block " ++ "chains.\n"); ++ return -ENOMEM; ++ } ++ ++ extent_min = new_sector; ++ extent_max = new_sector; ++ } ++ ++ if (!empty && ++ add_blocks_to_extent_chain(chain, extent_min, extent_max)) { ++ printk(KERN_ERR "Out of memory while making block chains.\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Like si_swapinfo, except that we don't include ram backed swap (compcache!) ++ * and don't need to use the spinlocks (userspace is stopped when this ++ * function is called). ++ */ ++void si_swapinfo_no_compcache(void) ++{ ++ unsigned int i; ++ ++ si_swapinfo(&swapinfo); ++ swapinfo.freeswap = 0; ++ swapinfo.totalswap = 0; ++ ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { ++ swapinfo.totalswap += si->inuse_pages; ++ swapinfo.freeswap += si->pages - si->inuse_pages; ++ } ++ } ++} ++/* ++ * We can't just remember the value from allocation time, because other ++ * processes might have allocated swap in the mean time. 
++ */ ++static unsigned long toi_swap_storage_available(void) ++{ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); ++ si_swapinfo_no_compcache(); ++ return swapinfo.freeswap + swap_allocated; ++} ++ ++static int toi_swap_initialise(int starting_cycle) ++{ ++ if (!starting_cycle) ++ return 0; ++ ++ enable_swapfile(); ++ return 0; ++} ++ ++static void toi_swap_cleanup(int ending_cycle) ++{ ++ if (!ending_cycle) ++ return; ++ ++ disable_swapfile(); ++} ++ ++static void toi_swap_free_storage(struct toi_bdev_info *chain) ++{ ++ /* Free swap entries */ ++ struct hibernate_extent *extentpointer; ++ unsigned long extentvalue; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", ++ chain); ++ ++ swap_allocated -= chain->allocations.size; ++ toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) ++ swap_free((swp_entry_t) { extentvalue }); ++ ++ toi_put_extent_chain(&chain->allocations); ++} ++ ++static void free_swap_range(unsigned long min, unsigned long max) ++{ ++ int j; ++ ++ for (j = min; j <= max; j++) ++ swap_free((swp_entry_t) { j }); ++ swap_allocated -= (max - min + 1); ++} ++ ++/* ++ * Allocation of a single swap type. Swap priorities are handled at the higher ++ * level. ++ */ ++static int toi_swap_allocate_storage(struct toi_bdev_info *chain, ++ unsigned long request) ++{ ++ unsigned long gotten = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" ++ " allocate %lu pages from device %d.", request, ++ chain->allocator_index); ++ ++ while (gotten < request) { ++ swp_entry_t start, end; ++ get_swap_range_of_type(chain->allocator_index, &start, &end, ++ request - gotten + 1); ++ if (start.val) { ++ int added = end.val - start.val + 1; ++ if (toi_add_to_extent_chain(&chain->allocations, ++ start.val, end.val)) { ++ printk(KERN_INFO "Failed to allocate extent for " ++ "%lu-%lu.\n", start.val, end.val); ++ free_swap_range(start.val, end.val); ++ break; ++ } ++ gotten += added; ++ swap_allocated += added; ++ } else ++ break; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); ++ return gotten; ++} ++ ++static int toi_swap_register_storage(void) ++{ ++ int i, result = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ struct toi_bdev_info *devinfo; ++ unsigned char *p; ++ unsigned char buf[256]; ++ struct fs_info *fs_info; ++ ++ if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) ++ continue; ++ ++ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), ++ GFP_ATOMIC); ++ if (!devinfo) { ++ printk("Failed to allocate devinfo struct for swap " ++ "device %d.\n", i); ++ return -ENOMEM; ++ } ++ ++ devinfo->bdev = si->bdev; ++ devinfo->allocator = &toi_swapops; ++ devinfo->allocator_index = i; ++ ++ fs_info = fs_info_from_block_dev(si->bdev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(devinfo->uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ if (!fs_info) ++ printk("fs_info from block dev returned %d.\n", result); ++ devinfo->dev_t = si->bdev->bd_dev; ++ devinfo->prio = si->prio; ++ devinfo->bmap_shift = 3; ++ devinfo->blocks_per_page = 1; ++ ++ p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); ++ sprintf(devinfo->name, "swap on %s", p); ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" ++ " Device %d (%lx), prio %d.", i, ++ (unsigned long) devinfo->dev_t, 
devinfo->prio); ++ toi_bio_ops.register_storage(devinfo); ++ } ++ ++ return 0; ++} ++ ++/* ++ * workspace_size ++ * ++ * Description: ++ * Returns the number of bytes of RAM needed for this ++ * code to do its work. (Used when calculating whether ++ * we have enough memory to be able to hibernate & resume). ++ * ++ */ ++static int toi_swap_memory_needed(void) ++{ ++ return 1; ++} ++ ++/* ++ * Print debug info ++ * ++ * Description: ++ */ ++static int toi_swap_print_debug_stats(char *buffer, int size) ++{ ++ int len = 0; ++ ++ len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); ++ if (swapfilename[0]) ++ len += scnprintf(buffer+len, size-len, ++ " Attempting to automatically swapon: %s.\n", ++ swapfilename); ++ ++ si_swapinfo_no_compcache(); ++ ++ len += scnprintf(buffer+len, size-len, ++ " Swap available for image: %lu pages.\n", ++ swapinfo.freeswap + swap_allocated); ++ ++ return len; ++} ++ ++static int header_locations_read_sysfs(const char *page, int count) ++{ ++ int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; ++ struct inode *swapf = NULL; ++ int zone; ++ char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); ++ char *path, *output = (char *) page; ++ int path_len; ++ ++ if (!page) ++ return 0; ++ ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ ++ if (!si || !(si->flags & SWP_WRITEOK)) ++ continue; ++ ++ if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { ++ haveswap = 1; ++ if (!printedpartitionsmessage) { ++ len += sprintf(output + len, ++ "For swap partitions, simply use the " ++ "format: resume=swap:/dev/hda1.\n"); ++ printedpartitionsmessage = 1; ++ } ++ } else { ++ path_len = 0; ++ ++ path = d_path(&si->swap_file->f_path, path_page, ++ PAGE_SIZE); ++ path_len = snprintf(path_page, PAGE_SIZE, "%s", path); ++ ++ haveswap = 1; ++ swapf = si->swap_file->f_mapping->host; ++ zone = bmap(swapf, 0); ++ if (!zone) { ++ len += sprintf(output + len, ++ "Swapfile %s has been corrupted. 
Reuse" ++ " mkswap on it and try again.\n", ++ path_page); ++ } else { ++ char name_buffer[BDEVNAME_SIZE]; ++ len += sprintf(output + len, ++ "For swapfile `%s`," ++ " use resume=swap:/dev/%s:0x%x.\n", ++ path_page, ++ bdevname(si->bdev, name_buffer), ++ zone << (swapf->i_blkbits - 9)); ++ } ++ } ++ } ++ ++ if (!haveswap) ++ len = sprintf(output, "You need to turn on swap partitions " ++ "before examining this file.\n"); ++ ++ toi_free_page(10, (unsigned long) path_page); ++ return len; ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), ++ SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, ++ header_locations_read_sysfs, NULL, 0, NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, ++ attempt_to_parse_resume_device2), ++}; ++ ++static struct toi_bio_allocator_ops toi_bio_swapops = { ++ .register_storage = toi_swap_register_storage, ++ .storage_available = toi_swap_storage_available, ++ .allocate_storage = toi_swap_allocate_storage, ++ .bmap = get_main_pool_phys_params, ++ .free_storage = toi_swap_free_storage, ++}; ++ ++static struct toi_module_ops toi_swapops = { ++ .type = BIO_ALLOCATOR_MODULE, ++ .name = "swap storage", ++ .directory = "swap", ++ .module = THIS_MODULE, ++ .memory_needed = toi_swap_memory_needed, ++ .print_debug_info = toi_swap_print_debug_stats, ++ .initialise = toi_swap_initialise, ++ .cleanup = toi_swap_cleanup, ++ .bio_allocator_ops = &toi_bio_swapops, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++static __init int toi_swap_load(void) ++{ ++ return toi_register_module(&toi_swapops); ++} ++ ++#ifdef MODULE ++static __exit void toi_swap_unload(void) ++{ ++ toi_unregister_module(&toi_swapops); ++} ++ ++module_init(toi_swap_load); ++module_exit(toi_swap_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce SwapAllocator"); ++#else ++late_initcall(toi_swap_load); ++#endif +diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c +new file mode 100644 +index 0000000..0088409 +--- /dev/null ++++ b/kernel/power/tuxonice_sysfs.c +@@ -0,0 +1,335 @@ ++/* ++ * kernel/power/tuxonice_sysfs.c ++ * ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains support for sysfs entries for tuning TuxOnIce. ++ * ++ * We have a generic handler that deals with the most common cases, and ++ * hooks for special handlers to use. ++ */ ++ ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_alloc.h" ++ ++static int toi_sysfs_initialised; ++ ++static void toi_initialise_sysfs(void); ++ ++static struct toi_sysfs_data sysfs_params[]; ++ ++#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) ++ ++static void toi_main_wrapper(void) ++{ ++ toi_try_hibernate(); ++} ++ ++static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, ++ char *page) ++{ ++ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); ++ int len = 0; ++ int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; ++ ++ if (full_prep && toi_start_anything(0)) ++ return -EBUSY; ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) ++ toi_prepare_usm(); ++ ++ switch (sysfs_data->type) { ++ case TOI_SYSFS_DATA_CUSTOM: ++ len = (sysfs_data->data.special.read_sysfs) ? 
++ (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) ++ : 0; ++ break; ++ case TOI_SYSFS_DATA_BIT: ++ len = sprintf(page, "%d\n", ++ -test_bit(sysfs_data->data.bit.bit, ++ sysfs_data->data.bit.bit_vector)); ++ break; ++ case TOI_SYSFS_DATA_INTEGER: ++ len = sprintf(page, "%d\n", ++ *(sysfs_data->data.integer.variable)); ++ break; ++ case TOI_SYSFS_DATA_LONG: ++ len = sprintf(page, "%ld\n", ++ *(sysfs_data->data.a_long.variable)); ++ break; ++ case TOI_SYSFS_DATA_UL: ++ len = sprintf(page, "%lu\n", ++ *(sysfs_data->data.ul.variable)); ++ break; ++ case TOI_SYSFS_DATA_STRING: ++ len = sprintf(page, "%s\n", ++ sysfs_data->data.string.variable); ++ break; ++ } ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) ++ toi_cleanup_usm(); ++ ++ if (full_prep) ++ toi_finish_anything(0); ++ ++ return len; ++} ++ ++#define BOUND(_variable, _type) do { \ ++ if (*_variable < sysfs_data->data._type.minimum) \ ++ *_variable = sysfs_data->data._type.minimum; \ ++ else if (*_variable > sysfs_data->data._type.maximum) \ ++ *_variable = sysfs_data->data._type.maximum; \ ++} while (0) ++ ++static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, ++ const char *my_buf, size_t count) ++{ ++ int assigned_temp_buffer = 0, result = count; ++ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); ++ ++ if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) ++ return -EBUSY; ++ ++ ((char *) my_buf)[count] = 0; ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) ++ toi_prepare_usm(); ++ ++ switch (sysfs_data->type) { ++ case TOI_SYSFS_DATA_CUSTOM: ++ if (sysfs_data->data.special.write_sysfs) ++ result = (sysfs_data->data.special.write_sysfs)(my_buf, ++ count); ++ break; ++ case TOI_SYSFS_DATA_BIT: ++ { ++ unsigned long value; ++ result = strict_strtoul(my_buf, 0, &value); ++ if (result) ++ break; ++ if (value) ++ set_bit(sysfs_data->data.bit.bit, ++ (sysfs_data->data.bit.bit_vector)); ++ else ++ clear_bit(sysfs_data->data.bit.bit, ++ (sysfs_data->data.bit.bit_vector)); ++ } ++ break; ++ case TOI_SYSFS_DATA_INTEGER: ++ { ++ long temp; ++ result = strict_strtol(my_buf, 0, &temp); ++ if (result) ++ break; ++ *(sysfs_data->data.integer.variable) = (int) temp; ++ BOUND(sysfs_data->data.integer.variable, integer); ++ break; ++ } ++ case TOI_SYSFS_DATA_LONG: ++ { ++ long *variable = ++ sysfs_data->data.a_long.variable; ++ result = strict_strtol(my_buf, 0, variable); ++ if (result) ++ break; ++ BOUND(variable, a_long); ++ break; ++ } ++ case TOI_SYSFS_DATA_UL: ++ { ++ unsigned long *variable = ++ sysfs_data->data.ul.variable; ++ result = strict_strtoul(my_buf, 0, variable); ++ if (result) ++ break; ++ BOUND(variable, ul); ++ break; ++ } ++ break; ++ case TOI_SYSFS_DATA_STRING: ++ { ++ int copy_len = count; ++ char *variable = ++ sysfs_data->data.string.variable; ++ ++ if (sysfs_data->data.string.max_length && ++ (copy_len > sysfs_data->data.string.max_length)) ++ copy_len = sysfs_data->data.string.max_length; ++ ++ if (!variable) { ++ variable = (char *) toi_get_zeroed_page(31, ++ TOI_ATOMIC_GFP); ++ sysfs_data->data.string.variable = variable; ++ assigned_temp_buffer = 1; ++ } ++ strncpy(variable, my_buf, copy_len); ++ if (copy_len && my_buf[copy_len - 1] == '\n') ++ variable[count - 1] = 0; ++ variable[count] = 0; ++ } ++ break; ++ } ++ ++ if (!result) ++ result = count; ++ ++ /* Side effect routine? 
*/ ++ if (result == count && sysfs_data->write_side_effect) ++ sysfs_data->write_side_effect(); ++ ++ /* Free temporary buffers */ ++ if (assigned_temp_buffer) { ++ toi_free_page(31, ++ (unsigned long) sysfs_data->data.string.variable); ++ sysfs_data->data.string.variable = NULL; ++ } ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) ++ toi_cleanup_usm(); ++ ++ toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); ++ ++ return result; ++} ++ ++static struct sysfs_ops toi_sysfs_ops = { ++ .show = &toi_attr_show, ++ .store = &toi_attr_store, ++}; ++ ++static struct kobj_type toi_ktype = { ++ .sysfs_ops = &toi_sysfs_ops, ++}; ++ ++struct kobject *tuxonice_kobj; ++ ++/* Non-module sysfs entries. ++ * ++ * This array contains entries that are automatically registered at ++ * boot. Modules and the console code register their own entries separately. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, ++ SYSFS_HIBERNATING, toi_main_wrapper), ++ SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, ++ SYSFS_RESUMING, toi_try_resume) ++}; ++ ++void remove_toi_sysdir(struct kobject *kobj) ++{ ++ if (!kobj) ++ return; ++ ++ kobject_put(kobj); ++} ++ ++struct kobject *make_toi_sysdir(char *name) ++{ ++ struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); ++ ++ if (!kobj) { ++ printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " ++ "dir!\n"); ++ return NULL; ++ } ++ ++ kobj->ktype = &toi_ktype; ++ ++ return kobj; ++} ++ ++/* toi_register_sysfs_file ++ * ++ * Helper for registering a new /sysfs/tuxonice entry. ++ */ ++ ++int toi_register_sysfs_file( ++ struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data) ++{ ++ int result; ++ ++ if (!toi_sysfs_initialised) ++ toi_initialise_sysfs(); ++ ++ result = sysfs_create_file(kobj, &toi_sysfs_data->attr); ++ if (result) ++ printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " ++ "returned %d.\n", ++ toi_sysfs_data->attr.name, result); ++ kobj->ktype = &toi_ktype; ++ ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_register_sysfs_file); ++ ++/* toi_unregister_sysfs_file ++ * ++ * Helper for removing unwanted /sys/power/tuxonice entries. ++ * ++ */ ++void toi_unregister_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data) ++{ ++ sysfs_remove_file(kobj, &toi_sysfs_data->attr); ++} ++EXPORT_SYMBOL_GPL(toi_unregister_sysfs_file); ++ ++void toi_cleanup_sysfs(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ if (!toi_sysfs_initialised) ++ return; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ kobject_put(tuxonice_kobj); ++ toi_sysfs_initialised = 0; ++} ++ ++/* toi_initialise_sysfs ++ * ++ * Initialise the /sysfs/tuxonice directory. 
++ */ ++ ++static void toi_initialise_sysfs(void) ++{ ++ int i; ++ int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ if (toi_sysfs_initialised) ++ return; ++ ++ /* Make our TuxOnIce directory a child of /sys/power */ ++ tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); ++ if (!tuxonice_kobj) ++ return; ++ ++ toi_sysfs_initialised = 1; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++} ++ ++int toi_sysfs_init(void) ++{ ++ toi_initialise_sysfs(); ++ return 0; ++} ++ ++void toi_sysfs_exit(void) ++{ ++ toi_cleanup_sysfs(); ++} +diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h +new file mode 100644 +index 0000000..4185c6d +--- /dev/null ++++ b/kernel/power/tuxonice_sysfs.h +@@ -0,0 +1,137 @@ ++/* ++ * kernel/power/tuxonice_sysfs.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ */ ++ ++#include ++ ++struct toi_sysfs_data { ++ struct attribute attr; ++ int type; ++ int flags; ++ union { ++ struct { ++ unsigned long *bit_vector; ++ int bit; ++ } bit; ++ struct { ++ int *variable; ++ int minimum; ++ int maximum; ++ } integer; ++ struct { ++ long *variable; ++ long minimum; ++ long maximum; ++ } a_long; ++ struct { ++ unsigned long *variable; ++ unsigned long minimum; ++ unsigned long maximum; ++ } ul; ++ struct { ++ char *variable; ++ int max_length; ++ } string; ++ struct { ++ int (*read_sysfs) (const char *buffer, int count); ++ int (*write_sysfs) (const char *buffer, int count); ++ void *data; ++ } special; ++ } data; ++ ++ /* Side effects routine. Used, eg, for reparsing the ++ * resume= entry when it changes */ ++ void (*write_side_effect) (void); ++ struct list_head sysfs_data_list; ++}; ++ ++enum { ++ TOI_SYSFS_DATA_NONE = 1, ++ TOI_SYSFS_DATA_CUSTOM, ++ TOI_SYSFS_DATA_BIT, ++ TOI_SYSFS_DATA_INTEGER, ++ TOI_SYSFS_DATA_UL, ++ TOI_SYSFS_DATA_LONG, ++ TOI_SYSFS_DATA_STRING ++}; ++ ++#define SYSFS_WRITEONLY 0200 ++#define SYSFS_READONLY 0444 ++#define SYSFS_RW 0644 ++ ++#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_BIT, \ ++ .flags = _flags, \ ++ .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } ++ ++#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_INTEGER, \ ++ .flags = _flags, \ ++ .data = { .integer = { .variable = _int, .minimum = _min, \ ++ .maximum = _max } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_UL, \ ++ .flags = _flags, \ ++ .data = { .ul = { .variable = _ul, .minimum = _min, \ ++ .maximum = _max } } } ++ ++#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_LONG, \ ++ .flags = _flags, \ ++ .data = { .a_long = { .variable = _long, .minimum = _min, \ ++ .maximum = _max } } } ++ ++#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_STRING, \ ++ .flags = _flags, \ ++ .data = { .string = { .variable = _string, .max_length = _max_len } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = 
TOI_SYSFS_DATA_CUSTOM, \ ++ .flags = _flags, \ ++ .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_NONE(_name, _wse) { \ ++ .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ ++ .type = TOI_SYSFS_DATA_NONE, \ ++ .write_side_effect = _wse, \ ++} ++ ++/* Flags */ ++#define SYSFS_NEEDS_SM_FOR_READ 1 ++#define SYSFS_NEEDS_SM_FOR_WRITE 2 ++#define SYSFS_HIBERNATE 4 ++#define SYSFS_RESUME 8 ++#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) ++#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) ++#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) ++#define SYSFS_NEEDS_SM_FOR_BOTH \ ++ (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) ++ ++int toi_register_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data); ++void toi_unregister_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data); ++ ++extern struct kobject *tuxonice_kobj; ++ ++struct kobject *make_toi_sysdir(char *name); ++void remove_toi_sysdir(struct kobject *obj); ++extern void toi_cleanup_sysfs(void); ++ ++extern int toi_sysfs_init(void); ++extern void toi_sysfs_exit(void); +diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c +new file mode 100644 +index 0000000..452b3db +--- /dev/null ++++ b/kernel/power/tuxonice_ui.c +@@ -0,0 +1,250 @@ ++/* ++ * kernel/power/tuxonice_ui.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for TuxOnIce's user interface. ++ * ++ * The user interface code talks to a userspace program via a ++ * netlink socket. ++ * ++ * The kernel side: ++ * - starts the userui program; ++ * - sends text messages and progress bar status; ++ * ++ * The user space side: ++ * - passes messages regarding user requests (abort, toggle reboot etc) ++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ ++ ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_builtin.h" ++ ++static char local_printf_buf[1024]; /* Same as printk - should be safe */ ++struct ui_ops *toi_current_ui; ++EXPORT_SYMBOL_GPL(toi_current_ui); ++ ++/** ++ * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. ++ * ++ * @timeout: Maximum time to wait. ++ * ++ * Wait for a keypress, either from userui or /dev/console if userui isn't ++ * available. The non-userui path is particularly for at boot-time, prior ++ * to userui being started, when we have an important warning to give to ++ * the user. ++ */ ++static char toi_wait_for_keypress(int timeout) ++{ ++ if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) ++ return ' '; ++ ++ return toi_wait_for_keypress_dev_console(timeout); ++} ++ ++/* toi_early_boot_message() ++ * Description: Handle errors early in the process of booting. ++ * The user may press C to continue booting, perhaps ++ * invalidating the image, or space to reboot. ++ * This works from either the serial console or normally ++ * attached keyboard. ++ * ++ * Note that we come in here from init, while the kernel is ++ * locked. If we want to get events from the serial console, ++ * we need to temporarily unlock the kernel. 
++ * ++ * toi_early_boot_message may also be called post-boot. ++ * In this case, it simply printks the message and returns. ++ * ++ * Arguments: int Whether we are able to erase the image. ++ * int default_answer. What to do when we timeout. This ++ * will normally be continue, but the user might ++ * provide command line options (__setup) to override ++ * particular cases. ++ * Char *. Pointer to a string explaining why we're moaning. ++ */ ++ ++#define say(message, a...) printk(KERN_EMERG message, ##a) ++ ++void toi_early_boot_message(int message_detail, int default_answer, ++ char *warning_reason, ...) ++{ ++#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) ++ unsigned long orig_state = get_toi_state(), continue_req = 0; ++ unsigned long orig_loglevel = console_loglevel; ++ int can_ask = 1; ++#else ++ int can_ask = 0; ++#endif ++ ++ va_list args; ++ int printed_len; ++ ++ if (!toi_wait) { ++ set_toi_state(TOI_CONTINUE_REQ); ++ can_ask = 0; ++ } ++ ++ if (warning_reason) { ++ va_start(args, warning_reason); ++ printed_len = vsnprintf(local_printf_buf, ++ sizeof(local_printf_buf), ++ warning_reason, ++ args); ++ va_end(args); ++ } ++ ++ if (!test_toi_state(TOI_BOOT_TIME)) { ++ printk("TuxOnIce: %s\n", local_printf_buf); ++ return; ++ } ++ ++ if (!can_ask) { ++ continue_req = !!default_answer; ++ goto post_ask; ++ } ++ ++#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) ++ console_loglevel = 7; ++ ++ say("=== TuxOnIce ===\n\n"); ++ if (warning_reason) { ++ say("BIG FAT WARNING!! %s\n\n", local_printf_buf); ++ switch (message_detail) { ++ case 0: ++ say("If you continue booting, note that any image WILL" ++ "NOT BE REMOVED.\nTuxOnIce is unable to do so " ++ "because the appropriate modules aren't\n" ++ "loaded. You should manually remove the image " ++ "to avoid any\npossibility of corrupting your " ++ "filesystem(s) later.\n"); ++ break; ++ case 1: ++ say("If you want to use the current TuxOnIce image, " ++ "reboot and try\nagain with the same kernel " ++ "that you hibernated from. If you want\n" ++ "to forget that image, continue and the image " ++ "will be erased.\n"); ++ break; ++ } ++ say("Press SPACE to reboot or C to continue booting with " ++ "this kernel\n\n"); ++ if (toi_wait > 0) ++ say("Default action if you don't select one in %d " ++ "seconds is: %s.\n", ++ toi_wait, ++ default_answer == TOI_CONTINUE_REQ ? ++ "continue booting" : "reboot"); ++ } else { ++ say("BIG FAT WARNING!!\n\n" ++ "You have tried to resume from this image before.\n" ++ "If it failed once, it may well fail again.\n" ++ "Would you like to remove the image and boot " ++ "normally?\nThis will be equivalent to entering " ++ "noresume on the\nkernel command line.\n\n" ++ "Press SPACE to remove the image or C to continue " ++ "resuming.\n\n"); ++ if (toi_wait > 0) ++ say("Default action if you don't select one in %d " ++ "seconds is: %s.\n", toi_wait, ++ !!default_answer ? 
++ "continue resuming" : "remove the image"); ++ } ++ console_loglevel = orig_loglevel; ++ ++ set_toi_state(TOI_SANITY_CHECK_PROMPT); ++ clear_toi_state(TOI_CONTINUE_REQ); ++ ++ if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ ++ continue_req = !!default_answer; ++ else ++ continue_req = test_toi_state(TOI_CONTINUE_REQ); ++ ++#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ ++ ++post_ask: ++ if ((warning_reason) && (!continue_req)) ++ kernel_restart(NULL); ++ ++ restore_toi_state(orig_state); ++ if (continue_req) ++ set_toi_state(TOI_CONTINUE_REQ); ++} ++EXPORT_SYMBOL_GPL(toi_early_boot_message); ++#undef say ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) ++ SYSFS_INT("default_console_level", SYSFS_RW, ++ &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), ++ SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, ++ 1 << 30, 0), ++ SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, ++ 0) ++#endif ++}; ++ ++static struct toi_module_ops userui_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "printk ui", ++ .directory = "user_interface", ++ .module = THIS_MODULE, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_register_ui_ops(struct ui_ops *this_ui) ++{ ++ if (toi_current_ui) { ++ printk(KERN_INFO "Only one TuxOnIce user interface module can " ++ "be loaded at a time."); ++ return -EBUSY; ++ } ++ ++ toi_current_ui = this_ui; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_register_ui_ops); ++ ++void toi_remove_ui_ops(struct ui_ops *this_ui) ++{ ++ if (toi_current_ui != this_ui) ++ return; ++ ++ toi_current_ui = NULL; ++} ++EXPORT_SYMBOL_GPL(toi_remove_ui_ops); ++ ++/* toi_console_sysfs_init ++ * Description: Boot time initialisation for user interface. 
++ */ ++ ++int toi_ui_init(void) ++{ ++ return toi_register_module(&userui_ops); ++} ++ ++void toi_ui_exit(void) ++{ ++ toi_unregister_module(&userui_ops); ++} +diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h +new file mode 100644 +index 0000000..4ced165 +--- /dev/null ++++ b/kernel/power/tuxonice_ui.h +@@ -0,0 +1,97 @@ ++/* ++ * kernel/power/tuxonice_ui.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ */ ++ ++enum { ++ DONT_CLEAR_BAR, ++ CLEAR_BAR ++}; ++ ++enum { ++ /* Userspace -> Kernel */ ++ USERUI_MSG_ABORT = 0x11, ++ USERUI_MSG_SET_STATE = 0x12, ++ USERUI_MSG_GET_STATE = 0x13, ++ USERUI_MSG_GET_DEBUG_STATE = 0x14, ++ USERUI_MSG_SET_DEBUG_STATE = 0x15, ++ USERUI_MSG_SPACE = 0x18, ++ USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, ++ USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, ++ USERUI_MSG_GET_LOGLEVEL = 0x1C, ++ USERUI_MSG_SET_LOGLEVEL = 0x1D, ++ USERUI_MSG_PRINTK = 0x1E, ++ ++ /* Kernel -> Userspace */ ++ USERUI_MSG_MESSAGE = 0x21, ++ USERUI_MSG_PROGRESS = 0x22, ++ USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, ++ ++ USERUI_MSG_MAX, ++}; ++ ++struct userui_msg_params { ++ u32 a, b, c, d; ++ char text[255]; ++}; ++ ++struct ui_ops { ++ char (*wait_for_key) (int timeout); ++ u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); ++ void (*prepare_status) (int clearbar, const char *fmt, ...); ++ void (*cond_pause) (int pause, char *message); ++ void (*abort)(int result_code, const char *fmt, ...); ++ void (*prepare)(void); ++ void (*cleanup)(void); ++ void (*message)(u32 section, u32 level, u32 normally_logged, ++ const char *fmt, ...); ++}; ++ ++extern struct ui_ops *toi_current_ui; ++ ++#define toi_update_status(val, max, fmt, args...) \ ++ (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ ++ max) ++ ++#define toi_prepare_console(void) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->prepare)(); \ ++ } while (0) ++ ++#define toi_cleanup_console(void) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->cleanup)(); \ ++ } while (0) ++ ++#define abort_hibernate(result, fmt, args...) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->abort)(result, fmt, ##args); \ ++ else { \ ++ set_abort_result(result); \ ++ } \ ++ } while (0) ++ ++#define toi_cond_pause(pause, message) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->cond_pause)(pause, message); \ ++ } while (0) ++ ++#define toi_prepare_status(clear, fmt, args...) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->prepare_status)(clear, fmt, ##args); \ ++ else \ ++ printk(KERN_INFO fmt "%s", ##args, "\n"); \ ++ } while (0) ++ ++#define toi_message(sn, lev, log, fmt, a...) \ ++do { \ ++ if (toi_current_ui && (!sn || test_debug_state(sn))) \ ++ toi_current_ui->message(sn, lev, log, fmt, ##a); \ ++} while (0) ++ ++__exit void toi_ui_cleanup(void); ++extern int toi_ui_init(void); ++extern void toi_ui_exit(void); ++extern int toi_register_ui_ops(struct ui_ops *this_ui); ++extern void toi_remove_ui_ops(struct ui_ops *this_ui); +diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c +new file mode 100644 +index 0000000..bc74672 +--- /dev/null ++++ b/kernel/power/tuxonice_userui.c +@@ -0,0 +1,667 @@ ++/* ++ * kernel/power/user_ui.c ++ * ++ * Copyright (C) 2005-2007 Bernard Blackham ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for TuxOnIce's user interface. ++ * ++ * The user interface code talks to a userspace program via a ++ * netlink socket. 
++ * ++ * The kernel side: ++ * - starts the userui program; ++ * - sends text messages and progress bar status; ++ * ++ * The user space side: ++ * - passes messages regarding user requests (abort, toggle reboot etc) ++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_power_off.h" ++ ++static char local_printf_buf[1024]; /* Same as printk - should be safe */ ++ ++static struct user_helper_data ui_helper_data; ++static struct toi_module_ops userui_ops; ++static int orig_kmsg; ++ ++static char lastheader[512]; ++static int lastheader_message_len; ++static int ui_helper_changed; /* Used at resume-time so don't overwrite value ++ set from initrd/ramfs. */ ++ ++/* Number of distinct progress amounts that userspace can display */ ++static int progress_granularity = 30; ++ ++static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); ++ ++/** ++ * ui_nl_set_state - Update toi_action based on a message from userui. ++ * ++ * @n: The bit (1 << bit) to set. ++ */ ++static void ui_nl_set_state(int n) ++{ ++ /* Only let them change certain settings */ ++ static const u32 toi_action_mask = ++ (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | ++ (1 << TOI_LOGALL) | ++ (1 << TOI_SINGLESTEP) | ++ (1 << TOI_PAUSE_NEAR_PAGESET_END); ++ static unsigned long new_action; ++ ++ new_action = (toi_bkd.toi_action & (~toi_action_mask)) | ++ (n & toi_action_mask); ++ ++ printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " ++ "to %lx.", n, toi_bkd.toi_action, new_action); ++ toi_bkd.toi_action = new_action; ++ ++ if (!test_action_state(TOI_PAUSE) && ++ !test_action_state(TOI_SINGLESTEP)) ++ wake_up_interruptible(&userui_wait_for_key); ++} ++ ++/** ++ * userui_post_atomic_restore - Tell userui that atomic restore just happened. ++ * ++ * Tell userui that atomic restore just occured, so that it can do things like ++ * redrawing the screen, re-getting settings and so on. ++ */ ++static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); ++} ++ ++/** ++ * userui_storage_needed - Report how much memory in image header is needed. ++ */ ++static int userui_storage_needed(void) ++{ ++ return sizeof(ui_helper_data.program) + 1 + sizeof(int); ++} ++ ++/** ++ * userui_save_config_info - Fill buffer with config info for image header. ++ * ++ * @buf: Buffer into which to put the config info we want to save. ++ */ ++static int userui_save_config_info(char *buf) ++{ ++ *((int *) buf) = progress_granularity; ++ memcpy(buf + sizeof(int), ui_helper_data.program, ++ sizeof(ui_helper_data.program)); ++ return sizeof(ui_helper_data.program) + sizeof(int) + 1; ++} ++ ++/** ++ * userui_load_config_info - Restore config info from buffer. ++ * ++ * @buf: Buffer containing header info loaded. ++ * @size: Size of data loaded for this module. 
++ */ ++static void userui_load_config_info(char *buf, int size) ++{ ++ progress_granularity = *((int *) buf); ++ size -= sizeof(int); ++ ++ /* Don't load the saved path if one has already been set */ ++ if (ui_helper_changed) ++ return; ++ ++ if (size > sizeof(ui_helper_data.program)) ++ size = sizeof(ui_helper_data.program); ++ ++ memcpy(ui_helper_data.program, buf + sizeof(int), size); ++ ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; ++} ++ ++/** ++ * set_ui_program_set: Record that userui program was changed. ++ * ++ * Side effect routine for when the userui program is set. In an initrd or ++ * ramfs, the user may set a location for the userui program. If this happens, ++ * we don't want to reload the value that was saved in the image header. This ++ * routine allows us to flag that we shouldn't restore the program name from ++ * the image header. ++ */ ++static void set_ui_program_set(void) ++{ ++ ui_helper_changed = 1; ++} ++ ++/** ++ * userui_memory_needed - Tell core how much memory to reserve for us. ++ */ ++static int userui_memory_needed(void) ++{ ++ /* ball park figure of 128 pages */ ++ return 128 * PAGE_SIZE; ++} ++ ++/** ++ * userui_update_status - Update the progress bar and (if on) in-bar message. ++ * ++ * @value: Current progress percentage numerator. ++ * @maximum: Current progress percentage denominator. ++ * @fmt: Message to be displayed in the middle of the progress bar. ++ * ++ * Note that a NULL message does not mean that any previous message is erased! ++ * For that, you need toi_prepare_status with clearbar on. ++ * ++ * Returns an unsigned long, being the next numerator (as determined by the ++ * maximum and progress granularity) where status needs to be updated. ++ * This is to reduce unnecessary calls to update_status. ++ */ ++static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) ++{ ++ static u32 last_step = 9999; ++ struct userui_msg_params msg; ++ u32 this_step, next_update; ++ int bitshift; ++ ++ if (ui_helper_data.pid == -1) ++ return 0; ++ ++ if ((!maximum) || (!progress_granularity)) ++ return maximum; ++ ++ if (value < 0) ++ value = 0; ++ ++ if (value > maximum) ++ value = maximum; ++ ++ /* Try to avoid math problems - we can't do 64 bit math here ++ * (and shouldn't need it - anyone got screen resolution ++ * of 65536 pixels or more?) */ ++ bitshift = fls(maximum) - 16; ++ if (bitshift > 0) { ++ u32 temp_maximum = maximum >> bitshift; ++ u32 temp_value = value >> bitshift; ++ this_step = (u32) ++ (temp_value * progress_granularity / temp_maximum); ++ next_update = (((this_step + 1) * temp_maximum / ++ progress_granularity) + 1) << bitshift; ++ } else { ++ this_step = (u32) (value * progress_granularity / maximum); ++ next_update = ((this_step + 1) * maximum / ++ progress_granularity) + 1; ++ } ++ ++ if (this_step == last_step) ++ return next_update; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ msg.a = this_step; ++ msg.b = progress_granularity; ++ ++ if (fmt) { ++ va_list args; ++ va_start(args, fmt); ++ vsnprintf(msg.text, sizeof(msg.text), fmt, args); ++ va_end(args); ++ msg.text[sizeof(msg.text)-1] = '\0'; ++ } ++ ++ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, ++ &msg, sizeof(msg)); ++ last_step = this_step; ++ ++ return next_update; ++} ++ ++/** ++ * userui_message - Display a message without necessarily logging it. ++ * ++ * @section: Type of message. Messages can be filtered by type. ++ * @level: Degree of importance of the message. Lower values = higher priority. 
++ * @normally_logged: Whether logged even if log_everything is off. ++ * @fmt: Message (and parameters). ++ * ++ * This function is intended to do the same job as printk, but without normally ++ * logging what is printed. The point is to be able to get debugging info on ++ * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" ++ * ++ * It may be called from an interrupt context - can't sleep! ++ */ ++static void userui_message(u32 section, u32 level, u32 normally_logged, ++ const char *fmt, ...) ++{ ++ struct userui_msg_params msg; ++ ++ if ((level) && (level > console_loglevel)) ++ return; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ msg.a = section; ++ msg.b = level; ++ msg.c = normally_logged; ++ ++ if (fmt) { ++ va_list args; ++ va_start(args, fmt); ++ vsnprintf(msg.text, sizeof(msg.text), fmt, args); ++ va_end(args); ++ msg.text[sizeof(msg.text)-1] = '\0'; ++ } ++ ++ if (test_action_state(TOI_LOGALL)) ++ printk(KERN_INFO "%s\n", msg.text); ++ ++ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, ++ &msg, sizeof(msg)); ++} ++ ++/** ++ * wait_for_key_via_userui - Wait for userui to receive a keypress. ++ */ ++static void wait_for_key_via_userui(void) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(&userui_wait_for_key, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ interruptible_sleep_on(&userui_wait_for_key); ++ ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&userui_wait_for_key, &wait); ++} ++ ++/** ++ * userui_prepare_status - Display high level messages. ++ * ++ * @clearbar: Whether to clear the progress bar. ++ * @fmt...: New message for the title. ++ * ++ * Prepare the 'nice display', drawing the header and version, along with the ++ * current action and perhaps also resetting the progress bar. ++ */ ++static void userui_prepare_status(int clearbar, const char *fmt, ...) ++{ ++ va_list args; ++ ++ if (fmt) { ++ va_start(args, fmt); ++ lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); ++ va_end(args); ++ } ++ ++ if (clearbar) ++ toi_update_status(0, 1, NULL); ++ ++ if (ui_helper_data.pid == -1) ++ printk(KERN_EMERG "%s\n", lastheader); ++ else ++ toi_message(0, TOI_STATUS, 1, lastheader, NULL); ++} ++ ++/** ++ * toi_wait_for_keypress - Wait for keypress via userui. ++ * ++ * @timeout: Maximum time to wait. ++ * ++ * Wait for a keypress from userui. ++ * ++ * FIXME: Implement timeout? ++ */ ++static char userui_wait_for_keypress(int timeout) ++{ ++ char key = '\0'; ++ ++ if (ui_helper_data.pid != -1) { ++ wait_for_key_via_userui(); ++ key = ' '; ++ } ++ ++ return key; ++} ++ ++/** ++ * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. ++ * ++ * @result_code: Reason why we're aborting (1 << bit). ++ * @fmt: Message to display if telling the user what's going on. ++ * ++ * Abort a cycle. If this wasn't at the user's request (and we're displaying ++ * output), tell the user why and wait for them to acknowledge the message. ++ */ ++static void userui_abort_hibernate(int result_code, const char *fmt, ...) 
++{ ++ va_list args; ++ int printed_len = 0; ++ ++ set_result_state(result_code); ++ ++ if (test_result_state(TOI_ABORTED)) ++ return; ++ ++ set_result_state(TOI_ABORTED); ++ ++ if (test_result_state(TOI_ABORT_REQUESTED)) ++ return; ++ ++ va_start(args, fmt); ++ printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), ++ fmt, args); ++ va_end(args); ++ if (ui_helper_data.pid != -1) ++ printed_len = sprintf(local_printf_buf + printed_len, ++ " (Press SPACE to continue)"); ++ ++ toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); ++ ++ if (ui_helper_data.pid != -1) ++ userui_wait_for_keypress(0); ++} ++ ++/** ++ * request_abort_hibernate - Abort hibernating or resuming at user request. ++ * ++ * Handle the user requesting the cancellation of a hibernation or resume by ++ * pressing escape. ++ */ ++static void request_abort_hibernate(void) ++{ ++ if (test_result_state(TOI_ABORT_REQUESTED) || ++ !test_action_state(TOI_CAN_CANCEL)) ++ return; ++ ++ if (test_toi_state(TOI_NOW_RESUMING)) { ++ toi_prepare_status(CLEAR_BAR, "Escape pressed. " ++ "Powering down again."); ++ set_toi_state(TOI_STOP_RESUME); ++ while (!test_toi_state(TOI_IO_STOPPED)) ++ schedule(); ++ if (toiActiveAllocator->mark_resume_attempted) ++ toiActiveAllocator->mark_resume_attempted(0); ++ toi_power_down(); ++ } ++ ++ toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" ++ " ABORTING HIBERNATION ---"); ++ set_abort_result(TOI_ABORT_REQUESTED); ++ wake_up_interruptible(&userui_wait_for_key); ++} ++ ++/** ++ * userui_user_rcv_msg - Receive a netlink message from userui. ++ * ++ * @skb: skb received. ++ * @nlh: Netlink header received. ++ */ ++static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type; ++ int *data; ++ ++ type = nlh->nlmsg_type; ++ ++ /* A control message: ignore them */ ++ if (type < NETLINK_MSG_BASE) ++ return 0; ++ ++ /* Unknown message: reply with EINVAL */ ++ if (type >= USERUI_MSG_MAX) ++ return -EINVAL; ++ ++ /* All operations require privileges, even GET */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { ++ printk(KERN_INFO "Got NOFREEZE_ME request when " ++ "ui_helper_data.pid is %d.\n", ui_helper_data.pid); ++ return -EBUSY; ++ } ++ ++ data = (int *) NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case USERUI_MSG_ABORT: ++ request_abort_hibernate(); ++ return 0; ++ case USERUI_MSG_GET_STATE: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_STATE, &toi_bkd.toi_action, ++ sizeof(toi_bkd.toi_action)); ++ return 0; ++ case USERUI_MSG_GET_DEBUG_STATE: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_DEBUG_STATE, ++ &toi_bkd.toi_debug_state, ++ sizeof(toi_bkd.toi_debug_state)); ++ return 0; ++ case USERUI_MSG_SET_STATE: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ ui_nl_set_state(*data); ++ return 0; ++ case USERUI_MSG_SET_DEBUG_STATE: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ toi_bkd.toi_debug_state = (*data); ++ return 0; ++ case USERUI_MSG_SPACE: ++ wake_up_interruptible(&userui_wait_for_key); ++ return 0; ++ case USERUI_MSG_GET_POWERDOWN_METHOD: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_POWERDOWN_METHOD, ++ &toi_poweroff_method, ++ sizeof(toi_poweroff_method)); ++ return 0; ++ case USERUI_MSG_SET_POWERDOWN_METHOD: ++ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) ++ return -EINVAL; ++ toi_poweroff_method = (unsigned 
long)(*data); ++ return 0; ++ case USERUI_MSG_GET_LOGLEVEL: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_LOGLEVEL, ++ &toi_bkd.toi_default_console_level, ++ sizeof(toi_bkd.toi_default_console_level)); ++ return 0; ++ case USERUI_MSG_SET_LOGLEVEL: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ toi_bkd.toi_default_console_level = (*data); ++ return 0; ++ case USERUI_MSG_PRINTK: ++ printk(KERN_INFO "%s", (char *) data); ++ return 0; ++ } ++ ++ /* Unhandled here */ ++ return 1; ++} ++ ++/** ++ * userui_cond_pause - Possibly pause at user request. ++ * ++ * @pause: Whether to pause or just display the message. ++ * @message: Message to display at the start of pausing. ++ * ++ * Potentially pause and wait for the user to tell us to continue. We normally ++ * only pause when @pause is set. While paused, the user can do things like ++ * changing the loglevel, toggling the display of debugging sections and such ++ * like. ++ */ ++static void userui_cond_pause(int pause, char *message) ++{ ++ int displayed_message = 0, last_key = 0; ++ ++ while (last_key != 32 && ++ ui_helper_data.pid != -1 && ++ ((test_action_state(TOI_PAUSE) && pause) || ++ (test_action_state(TOI_SINGLESTEP)))) { ++ if (!displayed_message) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "%s Press SPACE to continue.%s", ++ message ? message : "", ++ (test_action_state(TOI_SINGLESTEP)) ? ++ " Single step on." : ""); ++ displayed_message = 1; ++ } ++ last_key = userui_wait_for_keypress(0); ++ } ++ schedule(); ++} ++ ++/** ++ * userui_prepare_console - Prepare the console for use. ++ * ++ * Prepare a console for use, saving current kmsg settings and attempting to ++ * start userui. Console loglevel changes are handled by userui. ++ */ ++static void userui_prepare_console(void) ++{ ++ orig_kmsg = vt_kmsg_redirect(fg_console + 1); ++ ++ ui_helper_data.pid = -1; ++ ++ if (!userui_ops.enabled) { ++ printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); ++ return; ++ } ++ ++ if (*ui_helper_data.program) ++ toi_netlink_setup(&ui_helper_data); ++ else ++ printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); ++} ++ ++/** ++ * userui_cleanup_console - Cleanup after a cycle. ++ * ++ * Tell userui to cleanup, and restore kmsg_redirect to its original value. ++ */ ++ ++static void userui_cleanup_console(void) ++{ ++ if (ui_helper_data.pid > -1) ++ toi_netlink_close(&ui_helper_data); ++ ++ vt_kmsg_redirect(orig_kmsg); ++} ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. 
++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) ++ SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_CAN_CANCEL, 0), ++ SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_PAUSE, 0), ++ SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), ++ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, ++ 2048, 0, NULL), ++ SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, ++ set_ui_program_set), ++ SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) ++#endif ++}; ++ ++static struct toi_module_ops userui_ops = { ++ .type = MISC_MODULE, ++ .name = "userui", ++ .shared_directory = "user_interface", ++ .module = THIS_MODULE, ++ .storage_needed = userui_storage_needed, ++ .save_config_info = userui_save_config_info, ++ .load_config_info = userui_load_config_info, ++ .memory_needed = userui_memory_needed, ++ .post_atomic_restore = userui_post_atomic_restore, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++static struct ui_ops my_ui_ops = { ++ .update_status = userui_update_status, ++ .message = userui_message, ++ .prepare_status = userui_prepare_status, ++ .abort = userui_abort_hibernate, ++ .cond_pause = userui_cond_pause, ++ .prepare = userui_prepare_console, ++ .cleanup = userui_cleanup_console, ++ .wait_for_key = userui_wait_for_keypress, ++}; ++ ++/** ++ * toi_user_ui_init - Boot time initialisation for user interface. ++ * ++ * Invoked from the core init routine. ++ */ ++static __init int toi_user_ui_init(void) ++{ ++ int result; ++ ++ ui_helper_data.nl = NULL; ++ strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); ++ ui_helper_data.pid = -1; ++ ui_helper_data.skb_size = sizeof(struct userui_msg_params); ++ ui_helper_data.pool_limit = 6; ++ ui_helper_data.netlink_id = NETLINK_TOI_USERUI; ++ ui_helper_data.name = "userspace ui"; ++ ui_helper_data.rcv_msg = userui_user_rcv_msg; ++ ui_helper_data.interface_version = 8; ++ ui_helper_data.must_init = 0; ++ ui_helper_data.not_ready = userui_cleanup_console; ++ init_completion(&ui_helper_data.wait_for_process); ++ result = toi_register_module(&userui_ops); ++ if (!result) ++ result = toi_register_ui_ops(&my_ui_ops); ++ if (result) ++ toi_unregister_module(&userui_ops); ++ ++ return result; ++} ++ ++#ifdef MODULE ++/** ++ * toi_user_ui_ext - Cleanup code for if the core is unloaded. 
++ */ ++static __exit void toi_user_ui_exit(void) ++{ ++ toi_netlink_close_complete(&ui_helper_data); ++ toi_remove_ui_ops(&my_ui_ops); ++ toi_unregister_module(&userui_ops); ++} ++ ++module_init(toi_user_ui_init); ++module_exit(toi_user_ui_exit); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce Userui Support"); ++MODULE_LICENSE("GPL"); ++#else ++late_initcall(toi_user_ui_init); ++#endif +diff --git a/kernel/power/user.c b/kernel/power/user.c +index 4ed81e7..793144d 100644 +--- a/kernel/power/user.c ++++ b/kernel/power/user.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -42,6 +43,7 @@ static struct snapshot_data { + } snapshot_state; + + atomic_t snapshot_device_available = ATOMIC_INIT(1); ++EXPORT_SYMBOL_GPL(snapshot_device_available); + + static int snapshot_open(struct inode *inode, struct file *filp) + { +diff --git a/kernel/printk.c b/kernel/printk.c +index f24633a..86fca67 100644 +--- a/kernel/printk.c ++++ b/kernel/printk.c +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -71,6 +72,7 @@ int console_printk[4] = { + MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ + DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + }; ++EXPORT_SYMBOL_GPL(console_printk); + + /* + * Low level drivers may need that to know if they can schedule in +@@ -1867,6 +1869,7 @@ void suspend_console(void) + console_suspended = 1; + up(&console_sem); + } ++EXPORT_SYMBOL_GPL(suspend_console); + + void resume_console(void) + { +@@ -1876,6 +1879,7 @@ void resume_console(void) + console_suspended = 0; + console_unlock(); + } ++EXPORT_SYMBOL_GPL(resume_console); + + /** + * console_cpu_notify - print deferred console messages after CPU hotplug +diff --git a/mm/highmem.c b/mm/highmem.c +index b32b70c..db3d6ea 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -66,6 +66,7 @@ unsigned int nr_free_highpages (void) + + return pages; + } ++EXPORT_SYMBOL_GPL(nr_free_highpages); + + static int pkmap_count[LAST_PKMAP]; + static unsigned int last_pkmap_nr; +diff --git a/mm/memory.c b/mm/memory.c +index bb1369f..6ac7584 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1617,6 +1617,7 @@ no_page_table: + return ERR_PTR(-EFAULT); + return page; + } ++EXPORT_SYMBOL_GPL(follow_page); + + static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) + { +diff --git a/mm/mmzone.c b/mm/mmzone.c +index 4596d81..ddf6b28 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -8,11 +8,13 @@ + #include + #include + #include ++#include + + struct pglist_data *first_online_pgdat(void) + { + return NODE_DATA(first_online_node); + } ++EXPORT_SYMBOL_GPL(first_online_pgdat); + + struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + { +@@ -22,6 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + return NULL; + return NODE_DATA(nid); + } ++EXPORT_SYMBOL_GPL(next_online_pgdat); + + /* + * next_zone - helper magic for for_each_zone() +@@ -41,6 +44,7 @@ struct zone *next_zone(struct zone *zone) + } + return zone; + } ++EXPORT_SYMBOL_GPL(next_zone); + + static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) + { +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 7300c9d..44988d2 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -108,6 +108,7 @@ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ + * Flag that makes the machine dump writes/reads and block dirtyings. 
+ */ + int block_dump; ++EXPORT_SYMBOL_GPL(block_dump); + + /* + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index d1107ad..344404c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -132,6 +132,7 @@ void pm_restore_gfp_mask(void) + saved_gfp_mask = 0; + } + } ++EXPORT_SYMBOL_GPL(pm_restore_gfp_mask); + + void pm_restrict_gfp_mask(void) + { +@@ -140,6 +141,7 @@ void pm_restrict_gfp_mask(void) + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; + } ++EXPORT_SYMBOL_GPL(pm_restrict_gfp_mask); + + bool pm_suspended_storage(void) + { +@@ -2810,6 +2812,26 @@ static unsigned int nr_free_zone_pages(int offset) + return sum; + } + ++static unsigned int nr_unallocated_zone_pages(int offset) ++{ ++ struct zoneref *z; ++ struct zone *zone; ++ ++ /* Just pick one node, since fallback list is circular */ ++ unsigned int sum = 0; ++ ++ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); ++ ++ for_each_zone_zonelist(zone, z, zonelist, offset) { ++ unsigned long high = high_wmark_pages(zone); ++ unsigned long left = zone_page_state(zone, NR_FREE_PAGES); ++ if (left > high) ++ sum += left - high; ++ } ++ ++ return sum; ++} ++ + /* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +@@ -2820,6 +2842,15 @@ unsigned int nr_free_buffer_pages(void) + EXPORT_SYMBOL_GPL(nr_free_buffer_pages); + + /* ++ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL ++ */ ++unsigned int nr_unallocated_buffer_pages(void) ++{ ++ return nr_unallocated_zone_pages(gfp_zone(GFP_USER)); ++} ++EXPORT_SYMBOL_GPL(nr_unallocated_buffer_pages); ++ ++/* + * Amount of free RAM allocatable within all zones + */ + unsigned int nr_free_pagecache_pages(void) +diff --git a/mm/shmem.c b/mm/shmem.c +index 5dd56f6..72879f8 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1361,7 +1361,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) + } + + static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +- umode_t mode, dev_t dev, unsigned long flags) ++ umode_t mode, dev_t dev, unsigned long flags, int atomic_copy) + { + struct inode *inode; + struct shmem_inode_info *info; +@@ -1382,6 +1382,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + info->flags = flags & VM_NORESERVE; ++ if (atomic_copy) ++ inode->i_flags |= S_ATOMIC_COPY; + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); +@@ -1936,7 +1938,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) + struct inode *inode; + int error = -ENOSPC; + +- inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); ++ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE, 0); + if (inode) { + error = security_inode_init_security(inode, dir, + &dentry->d_name, +@@ -2075,7 +2077,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s + if (len > PAGE_CACHE_SIZE) + return -ENAMETOOLONG; + +- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); ++ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE, 0); + if (!inode) + return -ENOSPC; + +@@ -2604,7 +2606,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) + sb->s_flags |= MS_POSIXACL; + #endif + +- inode = shmem_get_inode(sb, NULL, S_IFDIR | 
sbinfo->mode, 0, VM_NORESERVE); ++ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE, 0); + if (!inode) + goto failed; + inode->i_uid = sbinfo->uid; +@@ -2857,7 +2859,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); + + #define shmem_vm_ops generic_file_vm_ops + #define shmem_file_operations ramfs_file_operations +-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) ++#define shmem_get_inode(sb, dir, mode, dev, flags, atomic_copy) ramfs_get_inode(sb, dir, mode, dev) + #define shmem_acct_size(flags, size) 0 + #define shmem_unacct_size(flags, size) do {} while (0) + +@@ -2870,8 +2872,10 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size ++ * @atomic_copy: Atomically copy the area when hibernating? + */ +-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) ++struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags, ++ int atomic_copy) + { + int error; + struct file *file; +@@ -2900,7 +2904,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags + path.mnt = mntget(shm_mnt); + + error = -ENOSPC; +- inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); ++ inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags, ++ atomic_copy); + if (!inode) + goto put_dentry; + +@@ -2938,7 +2943,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + +- file = shmem_file_setup("dev/zero", size, vma->vm_flags); ++ file = shmem_file_setup("dev/zero", size, vma->vm_flags, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + +diff --git a/mm/swap_state.c b/mm/swap_state.c +index 0cb36fb..3e3a20f 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -7,6 +7,7 @@ + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ + #include ++#include + #include + #include + #include +@@ -43,6 +44,7 @@ struct address_space swapper_space = { + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), + .backing_dev_info = &swap_backing_dev_info, + }; ++EXPORT_SYMBOL_GPL(swapper_space); + + #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e97a0e5..f321484 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -43,7 +44,6 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); +-static sector_t map_swap_entry(swp_entry_t, struct block_device**); + + DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -474,6 +474,56 @@ swp_entry_t get_swap_page_of_type(int type) + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; + } ++EXPORT_SYMBOL_GPL(get_swap_page_of_type); ++ ++static unsigned int find_next_to_unuse(struct swap_info_struct *si, ++ unsigned int prev, bool frontswap); ++ ++void get_swap_range_of_type(int type, swp_entry_t *start, swp_entry_t *end, ++ unsigned int limit) ++{ ++ struct swap_info_struct *si; ++ pgoff_t start_at; ++ unsigned int i; ++ ++ *start = swp_entry(0, 0); ++ *end = swp_entry(0, 0); ++ spin_lock(&swap_lock); ++ si = swap_info[type]; ++ if (si && (si->flags & SWP_WRITEOK)) { ++ /* This is called for allocating swap 
entry, not cache */ ++ start_at = scan_swap_map(si, 1); ++ if (start_at) { ++ unsigned int stop_at = find_next_to_unuse(si, start_at, 0); ++ if (stop_at > start_at) ++ stop_at--; ++ else ++ stop_at = si->max - 1; ++ if (stop_at - start_at + 1 > limit) ++ stop_at = min_t(unsigned int, ++ start_at + limit - 1, ++ si->max - 1); ++ /* Mark them used */ ++ for (i = start_at; i <= stop_at; i++) ++ si->swap_map[i] = 1; ++ /* first page already done above */ ++ si->inuse_pages += stop_at - start_at; ++ ++ nr_swap_pages -= stop_at - start_at + 1; ++ if (start_at + 1 == si->lowest_bit) ++ si->lowest_bit = stop_at + 1; ++ if (si->inuse_pages == si->pages) { ++ si->lowest_bit = si->max; ++ si->highest_bit = 0; ++ } ++ si->cluster_next = stop_at + 1; ++ *start = swp_entry(type, start_at); ++ *end = swp_entry(type, stop_at); ++ } ++ } ++ spin_unlock(&swap_lock); ++} ++EXPORT_SYMBOL_GPL(get_swap_range_of_type); + + static struct swap_info_struct *swap_info_get(swp_entry_t entry) + { +@@ -601,6 +651,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) + spin_unlock(&swap_lock); + } + } ++EXPORT_SYMBOL_GPL(swap_free); + + /* + * How many references to page are currently swapped out? +@@ -1279,7 +1330,7 @@ static void drain_mmlist(void) + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. + */ +-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) ++sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) + { + struct swap_info_struct *sis; + struct swap_extent *start_se; +@@ -1306,6 +1357,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) + BUG_ON(se == start_se); /* It *must* be present */ + } + } ++EXPORT_SYMBOL_GPL(map_swap_entry); + + /* + * Returns the page offset into bdev for the specified page's swap entry. +@@ -1617,6 +1669,7 @@ out: + putname(pathname); + return err; + } ++EXPORT_SYMBOL_GPL(sys_swapoff); + + #ifdef CONFIG_PROC_FS + static unsigned swaps_poll(struct file *file, poll_table *wait) +@@ -2103,6 +2156,7 @@ out: + mutex_unlock(&inode->i_mutex); + return error; + } ++EXPORT_SYMBOL_GPL(sys_swapon); + + void si_swapinfo(struct sysinfo *val) + { +@@ -2120,6 +2174,7 @@ void si_swapinfo(struct sysinfo *val) + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); + } ++EXPORT_SYMBOL_GPL(si_swapinfo); + + /* + * Verify that a swap entry is valid and increment its swap map count. 
+@@ -2254,8 +2309,15 @@ pgoff_t __page_file_index(struct page *page) + VM_BUG_ON(!PageSwapCache(page)); + return swp_offset(swap); + } ++ + EXPORT_SYMBOL_GPL(__page_file_index); + ++struct swap_info_struct *get_swap_info_struct(unsigned type) ++{ ++ return swap_info[type]; ++} ++EXPORT_SYMBOL_GPL(get_swap_info_struct); ++ + /* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 196709f..7897f18 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1188,7 +1188,7 @@ static int too_many_isolated(struct zone *zone, int file, + { + unsigned long inactive, isolated; + +- if (current_is_kswapd()) ++ if (current_is_kswapd() || sc->hibernation_mode) + return 0; + + if (!global_reclaim(sc)) +@@ -1762,7 +1762,9 @@ out: + unsigned long scan; + + scan = get_lru_size(lruvec, lru); +- if (sc->priority || noswap || !vmscan_swappiness(sc)) { ++ if (sc->hibernation_mode) ++ scan = SWAP_CLUSTER_MAX; ++ else if (sc->priority || noswap || !vmscan_swappiness(sc)) { + scan >>= sc->priority; + if (!scan && force_scan) + scan = SWAP_CLUSTER_MAX; +@@ -1798,6 +1800,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + ++ if (nr_reclaimed && nr_scanned && sc->nr_to_reclaim >= sc->nr_reclaimed) ++ return true; ++ + /* If not in reclaim/compaction mode, stop */ + if (!in_reclaim_compaction(sc)) + return false; +@@ -1896,7 +1901,7 @@ restart: + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ +- if (inactive_anon_is_low(lruvec)) ++ if (sc->hibernation_mode || inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + +@@ -2028,7 +2033,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + if (zone->all_unreclaimable && + sc->priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ +- if (IS_ENABLED(CONFIG_COMPACTION)) { ++ if (IS_ENABLED(CONFIG_COMPACTION) && !sc->hibernation_mode) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. 
+@@ -2116,6 +2121,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + unsigned long writeback_threshold; + bool aborted_reclaim; + ++#ifdef CONFIG_FREEZER ++ if (unlikely(pm_freezing && !sc->hibernation_mode)) ++ return 0; ++#endif ++ + delayacct_freepages_start(); + + if (global_reclaim(sc)) +@@ -3023,6 +3033,11 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) + if (!populated_zone(zone)) + return; + ++#ifdef CONFIG_FREEZER ++ if (pm_freezing) ++ return; ++#endif ++ + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; + pgdat = zone->zone_pgdat; +@@ -3083,11 +3098,11 @@ unsigned long zone_reclaimable_pages(struct zone *zone) + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped + */ +-unsigned long shrink_all_memory(unsigned long nr_to_reclaim) ++unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask) + { + struct reclaim_state reclaim_state; + struct scan_control sc = { +- .gfp_mask = GFP_HIGHUSER_MOVABLE, ++ .gfp_mask = mask, + .may_swap = 1, + .may_unmap = 1, + .may_writepage = 1, +@@ -3116,6 +3131,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) + + return nr_reclaimed; + } ++EXPORT_SYMBOL_GPL(shrink_memory_mask); ++ ++unsigned long shrink_all_memory(unsigned long nr_to_reclaim) ++{ ++ return shrink_memory_mask(nr_to_reclaim, GFP_HIGHUSER_MOVABLE); ++} ++EXPORT_SYMBOL_GPL(shrink_all_memory); + #endif /* CONFIG_HIBERNATION */ + + /* It's optimal to keep kswapds on the same CPUs as their memory, but diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch new file mode 100644 index 000000000..f5b1daf22 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch @@ -0,0 +1,63 @@ +# Calculate format=diff os_linux_system==desktop +--- .config.orig 2013-02-18 14:27:13.773480200 +0400 ++++ .config 2013-02-27 16:37:47.238816225 +0400 +@@ -288,8 +289,11 @@ + CONFIG_IOSCHED_NOOP=y + CONFIG_IOSCHED_DEADLINE=y + CONFIG_IOSCHED_CFQ=y ++CONFIG_IOSCHED_BFQ=y ++CONFIG_CGROUP_BFQIO=y + # CONFIG_DEFAULT_DEADLINE is not set + CONFIG_DEFAULT_CFQ=y ++# CONFIG_DEFAULT_BFQ is not set + # CONFIG_DEFAULT_NOOP is not set + CONFIG_DEFAULT_IOSCHED="cfq" + CONFIG_PREEMPT_NOTIFIERS=y +@@ -461,6 +465,29 @@ + CONFIG_HIBERNATE_CALLBACKS=y + CONFIG_HIBERNATION=y + CONFIG_PM_STD_PARTITION="" ++CONFIG_TOI_CORE=y ++ ++# ++# Image Storage (you need at least one allocator) ++# ++CONFIG_TOI_FILE=y ++CONFIG_TOI_SWAP=y ++ ++# ++# General Options ++# ++CONFIG_TOI_CRYPTO=y ++CONFIG_TOI_USERUI=y ++CONFIG_TOI_USERUI_DEFAULT_PATH="/sbin/tuxoniceui" ++CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT=-2 ++# CONFIG_TOI_KEEP_IMAGE is not set ++CONFIG_TOI_REPLACE_SWSUSP=y ++# CONFIG_TOI_IGNORE_LATE_INITCALL is not set ++CONFIG_TOI_DEFAULT_WAIT=25 ++CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE=2000 ++CONFIG_TOI_CHECKSUM=y ++CONFIG_TOI=y ++CONFIG_TOI_ZRAM_SUPPORT=y + CONFIG_PM_SLEEP=y + CONFIG_PM_SLEEP_SMP=y + # CONFIG_PM_AUTOSLEEP is not set +@@ -4330,7 +4357,7 @@ + CONFIG_CRYPTO_AEAD2=y + CONFIG_CRYPTO_BLKCIPHER=m + CONFIG_CRYPTO_BLKCIPHER2=y +-CONFIG_CRYPTO_HASH=m ++CONFIG_CRYPTO_HASH=y + CONFIG_CRYPTO_HASH2=y + CONFIG_CRYPTO_RNG2=y + CONFIG_CRYPTO_PCOMP2=y +@@ -4379,7 +4406,7 @@ + CONFIG_CRYPTO_CRC32C=m + # CONFIG_CRYPTO_CRC32C_INTEL is not set + # 
CONFIG_CRYPTO_GHASH is not set +-CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD4=y + CONFIG_CRYPTO_MD5=m + CONFIG_CRYPTO_MICHAEL_MIC=m + # CONFIG_CRYPTO_RMD128 is not set