diff --git a/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.10/4501_muqss.patch b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.10/4501_muqss.patch new file mode 100644 index 000000000..016b3e73a --- /dev/null +++ b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.10/4501_muqss.patch @@ -0,0 +1,11164 @@ +# Calculate format=diff merge(sys-kernel/calculate-sources[muqss])!= +From 35f6640868573a07b1291c153021f5d75749c15e Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 25 Oct 2019 14:00:52 +1100 +Subject: [PATCH 01/16] MultiQueue Skiplist Scheduler v0.205 + +--- + .../admin-guide/kernel-parameters.txt | 8 + + Documentation/admin-guide/sysctl/kernel.rst | 34 + + Documentation/scheduler/sched-BFS.txt | 351 + + Documentation/scheduler/sched-MuQSS.txt | 373 + + arch/alpha/Kconfig | 2 + + arch/arm/Kconfig | 2 + + arch/arm64/Kconfig | 2 + + arch/powerpc/Kconfig | 2 + + arch/powerpc/platforms/cell/spufs/sched.c | 5 - + arch/x86/Kconfig | 18 + + fs/proc/base.c | 2 +- + include/linux/init_task.h | 4 + + include/linux/ioprio.h | 2 + + include/linux/sched.h | 61 +- + include/linux/sched/deadline.h | 9 + + include/linux/sched/nohz.h | 2 +- + include/linux/sched/prio.h | 12 + + include/linux/sched/rt.h | 2 + + include/linux/sched/task.h | 2 +- + include/linux/skip_list.h | 33 + + include/uapi/linux/sched.h | 9 +- + init/Kconfig | 24 +- + init/init_task.c | 10 + + init/main.c | 2 + + kernel/Kconfig.MuQSS | 105 + + kernel/Makefile | 3 +- + kernel/delayacct.c | 2 +- + kernel/exit.c | 4 +- + kernel/kthread.c | 30 +- + kernel/livepatch/transition.c | 6 +- + kernel/sched/Makefile | 10 +- + kernel/sched/MuQSS.c | 7866 +++++++++++++++++ + kernel/sched/MuQSS.h | 1070 +++ + kernel/sched/cpufreq_schedutil.c | 12 +- + kernel/sched/cpupri.h | 2 + + kernel/sched/cputime.c | 22 +- + kernel/sched/idle.c | 2 + + kernel/sched/sched.h | 35 + + kernel/sched/topology.c | 8 + + kernel/skip_list.c | 148 + + 
kernel/sysctl.c | 63 +- + kernel/time/Kconfig | 2 +- + kernel/time/clockevents.c | 5 + + kernel/time/posix-cpu-timers.c | 4 +- + kernel/time/timer.c | 7 +- + kernel/trace/trace_selftest.c | 5 + + 46 files changed, 10332 insertions(+), 50 deletions(-) + create mode 100644 Documentation/scheduler/sched-BFS.txt + create mode 100644 Documentation/scheduler/sched-MuQSS.txt + create mode 100644 include/linux/skip_list.h + create mode 100644 kernel/Kconfig.MuQSS + create mode 100644 kernel/sched/MuQSS.c + create mode 100644 kernel/sched/MuQSS.h + create mode 100644 kernel/skip_list.c + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 44fde25bb221..044daa3a41ab 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4652,6 +4652,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- Do not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index d4b32cc32bb7..9e1e71fc66d0 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -436,6 +436,16 @@ this allows system administrators to override the + ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus.
++ ++Set to 70 (percent) by default. ++ ++ + kexec_load_disabled + =================== + +@@ -1077,6 +1087,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + sched_energy_aware + ================== + +@@ -1515,3 +1539,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. 
++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. 
++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. 
In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. 
The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 
100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. 
++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. 
The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. 
++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. 
The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000. ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling.
This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks.
To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. 
When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. 
++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued processes and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks.
++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). 
In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. 
If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examined locklessly and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability.
In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regardless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing.
++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and Numa aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. 
The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers. Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default. Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Runqueue sharing. ++ ++By default MuQSS chooses to share runqueue resources (specifically the skip ++list and locking) between multicore siblings. It is configurable at build time ++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing ++only between simultaneous multithreading siblings, multicore siblings, or ++symmetric multiprocessing physical packages. Additionally it can be set at ++boot time with the use of the rqshare parameter.
The reason for configurability ++is that some architectures have CPUs with many multicore siblings (>= 16) ++where it may be detrimental to throughput to share runqueues and another ++sharing option may be desirable. Additionally, more sharing than usual can ++improve latency on a system-wide level at the expense of throughput if desired. ++ ++The options are: ++none, smt, mc, smp ++ ++eg: ++ rqshare=mc ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks.
++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. 
Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. 
++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index d6e9fc7a7b19..f2d07b1939fd 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -667,6 +667,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 002e0cf025f5..a3045ba688ca 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1236,6 +1236,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index a6b5b7ef40ae..621c278bb5f2 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -976,6 +976,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 5181872f9452..bf3a47193bb5 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -888,6 +888,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index fbf26e0f7a6a..cc201f2d200b 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1008,6 +1008,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ help ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1038,6 +1054,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +diff --git a/fs/proc/base.c b/fs/proc/base.c +index b362523a9829..38e4f305ddf0 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index b2412b4d4c20..0db390aeae9f 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 76cd21fa5501..dc93c8907fcb 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -35,6 +35,10 @@ + #include + #include + ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif ++ + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; + struct backing_dev_info; +@@ -660,8 +664,10 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + 
struct __call_single_node wake_entry; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ +@@ -687,10 +693,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -886,6 +907,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1365,6 +1390,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct 
task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 6d67e9a5af6b..101fe470aa8f 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_remote(struct rq *rq); + void calc_load_nohz_stop(void); +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git 
a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 85fb2f34c59b..5feb3faee812 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static 
inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..f48c5c5da651 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,9 +115,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index 0872a5a2e759..f8aa9d2a9f51 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -96,6 +96,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ help ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -514,6 +526,7 @@ config SCHED_THERMAL_PRESSURE + default y if ARM64 + depends on SMP + depends on CPU_FREQ_THERMAL ++ depends on !SCHED_MUQSS + help + Select this option to enable thermal pressure accounting in the + scheduler. 
Thermal pressure is the value conveyed to the scheduler +@@ -863,6 +876,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -947,9 +961,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1078,6 +1096,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+@@ -1205,6 +1224,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index a56f0abb63e9..d337ec0f36fc 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -75,9 +75,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -87,6 +95,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -94,6 +103,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index 32b2a8affafd..d3c0b60ff0cf 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1433,6 +1433,8 @@ static int __ref kernel_init(void *unused) + + do_sysctl_args(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is 
the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. 
++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. 
++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Makefile b/kernel/Makefile +index 6c9f19911be0..bbd99777a89a 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o regset.o ++ async.o range.o smpboot.o ucount.o regset.o \ ++ skip_list.o + + obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o + obj-$(CONFIG_MODULES) += kmod.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 1f236ed375f8..f400301e2086 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 933a625621b8..93ee79fc2b48 100644 +--- a/kernel/kthread.c 
++++ b/kernel/kthread.c +@@ -471,6 +471,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -491,7 +519,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. 
*/ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. + */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 5fc9c9b70862..1ff14a21193d 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += 
stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..9478e2d473b7 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7866 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++#define CREATE_TRACE_POINTS ++#include ++#undef CREATE_TRACE_POINTS ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "MuQSS.h" ++#include "smp.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. 
++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.205 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare=none|smt|mc|llc|smp|all ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all
tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. 
++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compiler should just see 0 here, and optimize out the call ++ * to update_irq_load_avg. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight misattribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops.
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((&paravirt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later.
++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. 
++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. 
++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. 
++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. 
It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. 
++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg above. delta is actually time in nanoseconds ++ * here so it is converted to microseconds before being added to the average. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8.
++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++static inline void inc_nr_running(struct rq *rq) ++{ ++ rq->nr_running++; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, 1); ++ } ++} ++ ++static inline void dec_nr_running(struct rq *rq) ++{ ++ rq->nr_running--; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, -1); ++ } ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. 
++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion.
++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ inc_nr_running(rq); ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the ratio of p's deadline length to the shortest deadline, ++ * which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless.
*/ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; /* defaults to smt_always_schedule(); presumably repointed at smt_should_schedule() elsewhere - not visible here, confirm */ ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether
any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) /* NOTE(review): best_mask_cpu() only uses this bit to seed best_ranking; it is never set in a candidate's ranking, busy siblings are handled by the siblings_idle() tie-break instead */ ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next.
The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; /* best_cpu is itself in tmpmask: keep it without ranking the others */ ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched
idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++DEFINE_PER_CPU(cpumask_t, idlemask); ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; /* NOTE(review): resched_idle() below overwrites rq->preempt with rq->idle unless the idle task already has need_resched set - confirm this store is still needed */ ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/*
++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - Mark p as no longer on a runqueue and account the dequeue ++ * to sched_info and PSI; nr_running is not touched here. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU.
*/ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off its runqueue and take it to the cpu where it will ++ * become the running task. ++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated.
++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive.
++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good.
It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; /* equal priority: SCHED_BATCH never preempts */ ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr().
++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. ++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct
task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule.
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++#ifdef CONFIG_SMP ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++ else ++#endif ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Consider @p being inside a wait loop: ++ * ++ * for (;;) { ++ * set_current_state(TASK_UNINTERRUPTIBLE); ++ * ++ * if (CONDITION) ++ * break; ++ * ++ * schedule(); ++ * } ++ * __set_current_state(TASK_RUNNING); ++ * ++ * between set_current_state() and schedule(). In this case @p is still ++ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in ++ * an atomic manner. ++ * ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq ++ * then schedule() must still happen and p->state can be changed to ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we ++ * need to do a full wakeup with enqueue. ++ * ++ * Returns: %true when the wakeup is done, ++ * %false otherwise. ++ */ ++static int ttwu_runnable(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups.
++ * Races such that false-negatives are possible, since they ++ * are shorter lived than false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_pending() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. ++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data.
++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. ++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Returns the chosen CPU number and does not itself lock or unlock a runqueue.
++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++ ++ rq_lock(rq); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this.
++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_runnable()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with smp_store_mb() ++ * in set_current_state() that the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule().
See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about its own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). ++ */ ++ p->state = TASK_WAKING; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, consider queueing p on the remote CPUs wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared.
++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spinlock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. ++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until it's done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight.
++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, NULL); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ rq_unlock(rq); /* NOTE(review): ttwu_runnable() pairs __task_rq_lock(p, NULL) with __task_rq_unlock(rq, NULL); rq_unlock() is used here - confirm the two are equivalent */ ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current.
++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rseq_migrate(p); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++void sched_post_fork(struct task_struct *p) ++{ ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. 
++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. 
++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. ++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(&notifier->link, &current->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier.
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(&notifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We ++ * must ensure this doesn't happen until the switch is completely ++ * finished.
++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. 
++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. 
++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch has flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could race with its RUNNING -> DEAD ++ * transition, resulting in a double drop.
++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing through ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state.
++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. 
++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. 
++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. 
++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(void) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = 
kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time does not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. The nice/user banking below still runs for ksoftirqd as well. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds.
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. Unlike update_cpu_clock_tick(), any non-idle time is banked via pc_user_time() and p->last_ran is left untouched. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task.
++ * In case the task is currently running, also include current's pending ++ * runtime that has not been accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have an optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division.
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. 
Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too many is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ curr = rq->curr; ++ update_rq_clock(rq); ++ ++ if (!is_idle_task(curr)) { ++ /* ++ * Sanity check: warn once if curr's last_ran lags the ++ * rq task clock by more than three seconds. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required.
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. */ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing?
++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. ++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged as not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks.
++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entries lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle.
++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. ++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* p passed the smt and affinity checks above */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here.
*/ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ 
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. 
*/ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) 
++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. 
++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that: ++ * ++ * - we form a control dependency vs deactivate_task() below. ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us. ++ */ ++ prev_state = prev->state; ++ if (!preempt && prev_state) { ++ if (signal_pending_state(prev_state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. ++ */ ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. 
++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ inc_nr_running(rq); ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ dec_nr_running(rq); ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ unsigned int task_flags; ++ ++ if (!tsk->state) ++ return; ++ ++ task_flags = tsk->flags; ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (task_flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. 
++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disabled() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function.
Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. 
++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. 
++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guarantees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost.
++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_rq_unlock(rq, NULL); ++ preempt_enable(); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU.
++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it won't have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner.
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question.
++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler().
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks.
++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. ++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval =
security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads is a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Use sched_set_fifo(), read its comment. ++ * ++ * Return: 0 on success.
An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++ ++/* ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally ++ * incapable of resource management, which is the one thing an OS really should ++ * be doing. ++ * ++ * This is of course the reason it is limited to privileged users only. ++ * ++ * Worse still; it is fundamentally impossible to compose static priority ++ * workloads. You cannot take two correctly working static prio workloads ++ * and smash them together and still expect them to work. ++ * ++ * For this reason 'all' FIFO tasks the kernel creates are basically at: ++ * ++ * MAX_RT_PRIO / 2 ++ * ++ * The administrator _MUST_ configure the system, the kernel simply doesn't ++ * know enough information to make a sensible choice. 
++ */ ++void sched_set_fifo(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo); ++ ++/* ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. ++ */ ++void sched_set_fifo_low(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = 1 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo_low); ++ ++void sched_set_normal(struct task_struct *p, int nice) ++{ ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ .sched_nice = nice, ++ }; ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_normal); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getparam - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code.
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case.
++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (has_rt_policy(p)) /* by policy, as in sched_getparam */ ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto
out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise.
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. 
++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. 
++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * Bail out with -ESRCH if task_running() reports the target as ++ * running or its state is non-zero: there is nothing to yield to.
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned.
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", ++ free, task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. 
++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->flags |= PF_IDLE; ++ ++ scs_task_reset(idle); ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct sched_domain *sd; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu_and(i, sched_domain_span(sd), ++ housekeeping_cpumask(HK_FLAG_TIMER)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i)) { ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. 
If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ /* ++ * Picking a ~random cpu helps in cases where we are changing affinity ++ * for groups of tasks (ie. cpuset), so that load balancing is not ++ * immediately required to distribute the tasks within their new mask. 
++ */ ++ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. ++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. 
++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. 
++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. 
++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", 
&sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ 
++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets.
++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++#ifdef CONFIG_X86 ++ return per_cpu(cpu_llc_shared_map, cpu); ++#else ++ return topology_core_cpumask(cpu); ++#endif ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++/* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. 
++ */ ++static void __init select_leaders(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smp_leader) ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->mc_leader) ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smt_leader) ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++} ++ ++/* FIXME freeing locked spinlock */ ++static void __init 
share_and_free_rq(struct rq *leader, struct rq *rq) ++{ ++ WARN_ON(rq->nr_running > 0); ++ ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ rq->is_leader = false; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++} ++ ++static void __init share_rqs(void) ++{ ++ struct rq *rq, *leader; ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++} ++ ++static void __init setup_rq_orders(void) ++{ ++ int *selected_cpus, *ordered_cpus; ++ struct rq *rq, *other_rq; ++ int cpu, other_cpu, i; ++ ++ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if (rq->is_leader) ++ total_runqueues++; ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu 
orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if (other_rq->is_leader) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ kfree(selected_cpus); ++ kfree(ordered_cpus); ++ ++#ifdef CONFIG_X86 ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++#endif ++} ++ ++void __init sched_init_smp(void) ++{ ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. 
++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ select_leaders(); ++ ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ share_rqs(); ++ ++ local_irq_enable(); ++ ++ setup_rq_orders(); ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ 
struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->is_leader = true; ++ rq->smp_leader = NULL; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = NULL; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = NULL; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ 
rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful 
for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPUs synchronised and interrupts disabled, and ++ * the caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* Allocate a zeroed task_group from task_group_cache; @parent is not used here */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* RCU callback: free the task_group that embeds @rhp */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Queued via call_rcu(); release the task_group itself */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Defer the free via call_rcu() so concurrent RCU readers of @tg finish first */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void call_trace_sched_update_nr_running(struct rq *rq, int count) ++{ ++ trace_sched_update_nr_running_tp(rq, count); ++} ++ ++/* CFS Compat */ ++#ifdef CONFIG_RCU_TORTURE_TEST ++int sysctl_sched_rt_runtime; ++#endif +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..09a1f2fe64ba +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1070 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include 
++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#include ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++extern void call_trace_sched_update_nr_running(struct rq *rq, int count); ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. 
++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle; ++ struct task_struct *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++#ifdef CONFIG_SMP ++ unsigned int ttwu_pending; ++#endif ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct 
root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ bool is_leader; ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ 
++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ 
break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned 
int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. 
(Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void flush_smp_call_function_from_idle(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ 
SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_rq_util_with(struct rq 
__maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline bool uclamp_is_used(void) ++{ ++ return false; ++} ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++/** ++ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. ++ * @cpu: the CPU in question. ++ * ++ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. ++ * ++ * f_curr ++ * ------ * SCHED_CAPACITY_SCALE ++ * f_max ++ */ ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. 
++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. ++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ ++static inline int ++update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) ++{ ++ return 0; ++} ++ ++static inline u64 thermal_load_avg(struct rq *rq) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_RCU_TORTURE_TEST ++extern int sysctl_sched_rt_runtime; ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 97d318b0cd0c..fb466c681a1f 100644 +--- 
a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -172,6 +172,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -200,7 +206,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -642,7 +648,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index efbb492bb94c..f0288c32ab17 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,6 +17,7 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask); +@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5a55d2300452..283a580754a7 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT 
+-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -614,7 +594,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index c6932b8f4467..3fc76869cf5f 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -395,6 +395,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. 
+ */ +@@ -508,3 +509,4 @@ const struct sched_class idle_sched_class + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index df80bfcea92e..e17785e91f0e 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2633,3 +2646,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* MuQSS compatibility functions */ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index dd7770226086..4bd57a89aa6a 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -454,7 +454,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -480,7 +484,11 @@ void 
rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. 
++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 
0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index afad085960b8..f2753a92b2d8 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -120,6 +120,14 @@ static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int two_hundred = 200; + static int one_thousand = 1000; ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -184,7 +192,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; + int sysctl_legacy_va_layout; + #endif + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -193,7 +201,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ 
+-#endif /* CONFIG_SCHED_DEBUG */ ++#endif /* CONFIG_SCHED_DEBUG && !CONFIG_SCHED_MUQSS */ + + #ifdef CONFIG_COMPACTION + static int min_extfrag_threshold; +@@ -1652,6 +1660,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1843,6 +1852,56 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ONE, + }, + #endif ++#elif defined(CONFIG_SCHED_MUQSS) ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS) ++ { ++ .procname = "sched_schedstats", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sysctl_schedstats, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */ ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index a09b1d61df6a..3c6267b7a630 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -132,7 +132,7 @@ config CONTEXT_TRACKING + + config 
CONTEXT_TRACKING_FORCE + bool "Force context tracking" +- depends on CONTEXT_TRACKING ++ depends on CONTEXT_TRACKING && !SCHED_MUQSS + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..7a61971cca74 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,13 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + ++#ifdef CONFIG_SCHED_MUQSS ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) ++#else + /* Limit min_delta to a jiffie */ + #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++#endif + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index a71758e34e45..ebb84a65d928 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index c3ad64fb9d8b..8255b3a180b8 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1584,7 +1584,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1602,6 +1602,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1671,7 +1674,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 4738ad48a667..f050201c0574 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1053,10 +1053,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +-- +2.25.1 + diff --git a/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4500_uksm.patch b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4500_uksm.patch new file mode 100644 index 000000000..b69783237 --- 
/dev/null +++ b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4500_uksm.patch @@ -0,0 +1,6971 @@ +# Calculate format=diff merge(sys-kernel/calculate-sources[uksm])!= +From f785a5c37288394e6e74308bf35c38dcfa665201 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Mon, 22 Feb 2021 18:08:28 +0100 +Subject: [PATCH] UKSM for 5.11 + +Signed-off-by: Piotr Gorski +--- + Documentation/vm/uksm.txt | 61 + + fs/exec.c | 1 + + fs/proc/meminfo.c | 4 + + include/linux/ksm.h | 43 +- + include/linux/mm_types.h | 3 + + include/linux/mmzone.h | 3 + + include/linux/pgtable.h | 17 +- + include/linux/sradix-tree.h | 77 + + include/linux/uksm.h | 149 + + kernel/fork.c | 2 +- + lib/Makefile | 2 +- + lib/sradix-tree.c | 476 +++ + mm/Kconfig | 26 + + mm/Makefile | 3 +- + mm/ksm.c | 11 - + mm/memory.c | 33 +- + mm/mmap.c | 37 + + mm/uksm.c | 5614 +++++++++++++++++++++++++++++++++++ + mm/vmstat.c | 3 + + 19 files changed, 6539 insertions(+), 26 deletions(-) + create mode 100644 Documentation/vm/uksm.txt + create mode 100644 include/linux/sradix-tree.h + create mode 100644 include/linux/uksm.h + create mode 100644 lib/sradix-tree.c + create mode 100644 mm/uksm.c + +diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt +new file mode 100644 +index 000000000..be19a3127 +--- /dev/null ++++ b/Documentation/vm/uksm.txt +@@ -0,0 +1,61 @@ ++The Ultra Kernel Samepage Merging feature ++---------------------------------------------- ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. 
Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that do not contain duplicated pages at a speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * A thrashing area (a VMA that has frequent KSM page break-outs) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than the default C version on x86. ++ * * rmap_item now has a struct *page member to loosely cache an ++ * address-->page mapping, which avoids many time-costly ++ * follow_page() calls. ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No ++ * break_ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration (contributed by Figo Zhang) ++ * Now uksmd considers full zero pages as special pages and merges them into a ++ * special unswappable uksm zero page. ++ */ ++ ++ChangeLog: ++ ++2012-05-05 The creation of this Doc ++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. ++2012-05-28 UKSM 0.1.1.2 bug fix release ++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 ++2012-07-02 UKSM 0.1.2-beta2 ++2012-07-10 UKSM 0.1.2-beta3 ++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. ++2012-10-13 UKSM 0.1.2.1 Bug fixes. ++2012-12-31 UKSM 0.1.2.2 Minor bug fixes. ++2014-07-02 UKSM 0.1.2.3 Fix a "__this_cpu_read() in preemptible" bug.
++2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings. ++2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. ++2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. +diff --git a/fs/exec.c b/fs/exec.c +index 5d4d52039..86f9f6526 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -65,6 +65,7 @@ + #include + #include + #include ++#include + + #include + #include +diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c +index d6fc74619..e173b259f 100644 +--- a/fs/proc/meminfo.c ++++ b/fs/proc/meminfo.c +@@ -108,6 +108,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) + #endif + show_val_kb(m, "PageTables: ", + global_node_page_state(NR_PAGETABLE)); ++#ifdef CONFIG_UKSM ++ show_val_kb(m, "KsmZeroPages: ", ++ global_zone_page_state(NR_UKSM_ZERO_PAGES)); ++#endif + + show_val_kb(m, "NFS_Unstable: ", 0); + show_val_kb(m, "Bounce: ", +diff --git a/include/linux/ksm.h b/include/linux/ksm.h +index 161e8164a..f0dbdf3c9 100644 +--- a/include/linux/ksm.h ++++ b/include/linux/ksm.h +@@ -21,20 +21,16 @@ struct mem_cgroup; + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); + +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++static inline struct stable_node *page_stable_node(struct page *page) + { +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; ++ return PageKsm(page) ?
page_rmapping(page) : NULL; + } + +-static inline void ksm_exit(struct mm_struct *mm) ++static inline void set_page_stable_node(struct page *page, ++ struct stable_node *stable_node) + { +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); ++ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + } + + /* +@@ -54,6 +50,33 @@ struct page *ksm_might_need_to_copy(struct page *page, + void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -89,4 +112,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include ++ + #endif /* __LINUX_KSM_H */ +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 07d9acb5b..858a2f712 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -369,6 +369,9 @@ struct vm_area_struct { + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + } __randomize_layout; + + struct core_thread { +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h 
+index b593316bf..90d5b5a43 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -157,6 +157,9 @@ enum zone_stat_item { + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif + NR_FREE_CMA_PAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + enum node_stat_item { +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 8fcdfa52e..cc511ae57 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1115,12 +1115,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, + extern void untrack_pfn_moved(struct vm_area_struct *vma); + #endif + ++#ifdef CONFIG_UKSM ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ extern unsigned long uksm_zero_pfn; ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + #ifdef __HAVE_COLOR_ZERO_PAGE + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_pfn; +- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); ++ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); + } + + #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +@@ -1129,7 +1142,7 @@ static inline int is_zero_pfn(unsigned long pfn) + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } + + static inline unsigned long my_zero_pfn(unsigned long addr) +diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +new file mode 100644 +index 000000000..d71edba6b +--- /dev/null ++++ b/include/linux/sradix-tree.h +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} 
while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. */ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned int offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void 
sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff --git a/include/linux/uksm.h b/include/linux/uksm.h +new file mode 100644 +index 000000000..bb8651f53 +--- /dev/null ++++ b/include/linux/uksm.h +@@ -0,0 +1,149 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. */ ++#ifdef CONFIG_UKSM ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long this_sampled; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this 
round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++#ifdef VM_SAO ++ if (vm_flags & VM_SAO) ++ return 0; ++#endif ++ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED ++ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. 
++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff --git a/kernel/fork.c b/kernel/fork.c +index d66cd1014..5e0081886 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -588,7 +588,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(tmp, mpnt); +diff --git a/lib/Makefile b/lib/Makefile +index afeff05fa..691f13e0f 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -31,7 +31,7 @@ endif + KCSAN_SANITIZE_random32.o := n + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o timerqueue.o xarray.o \ ++ rbtree.o radix-tree.o sradix-tree.o timerqueue.o xarray.o \ + idr.o extable.o sha1.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ + is_single_threaded.o plist.o decompress.o kobject_uevent.o \ +diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c +new file mode 100644 +index 000000000..ab21e6309 +--- /dev/null ++++ b/lib/sradix-tree.c +@@ -0,0 +1,476 @@ ++#include ++#include ++#include ++#include 
++#include ++#include ++#include ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. ++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. */ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search the next item from the current node, that is not NULL ++ * and can satisfy the iter() callback.
++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. 
++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ 
if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. ++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) ++ root->enter_node = NULL; ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start 
!= end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. 
++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us having no meaning. Caller can call this ++ * if it's necessary to do so. 
++ */ ++ ++ node->count++; ++ root->num++; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ } else { ++ if (root->min == index) { ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ } ++ ++ return item; ++ } else { ++ return NULL; ++ } ++ ++} ++ ++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return -ENOENT; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return -ENOENT; ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ offset = index & root->mask; ++ if (!node->stores[offset]) ++ return -ENOENT; ++ ++ sradix_tree_delete_from_leaf(root, node, index); ++ ++ return 0; ++} +diff --git a/mm/Kconfig b/mm/Kconfig +index f730605b8..89d11e029 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -317,6 +317,32 @@ config KSM + See Documentation/vm/ksm.rst for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). ++choice ++ prompt "Choose UKSM/KSM strategy" ++ default UKSM ++ depends on KSM ++ help ++ This option allows to select a UKSM/KSM strategy. ++ ++config UKSM ++ bool "Ultra-KSM for page merging" ++ depends on KSM ++ help ++ UKSM is inspired by the Linux kernel project - KSM(Kernel Same ++ page Merging), but with a fundamentally rewritten core algorithm.
With ++ an advanced algorithm, UKSM now can transparently scan all anonymously ++ mapped user space applications with a significantly improved scan speed ++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from ++ UKSM. Now UKSM has its first stable release and first real world enterprise user. ++ For more information, please go to its project page. ++ (github.com/dolohow/uksm) ++ ++config KSM_LEGACY ++ bool "Legacy KSM implementation" ++ depends on KSM ++ help ++ The legacy KSM implementation from Red Hat. ++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff --git a/mm/Makefile b/mm/Makefile +index b6cd2fffa..abe1ab4d4 100644 +--- a/mm/Makefile ++++ b/mm/Makefile +@@ -76,7 +76,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o + obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += page_poison.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff --git a/mm/ksm.c b/mm/ksm.c +index 9694ee2c7..63af6a528 100644 +--- a/mm/ksm.c ++++ b/mm/ksm.c +@@ -858,17 +858,6 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, + return err; + } + +-static inline struct stable_node *page_stable_node(struct page *page) +-{ +- return PageKsm(page) ?
page_rmapping(page) : NULL; +-} +- +-static inline void set_page_stable_node(struct page *page, +- struct stable_node *stable_node) +-{ +- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +-} +- + #ifdef CONFIG_SYSFS + /* + * Only called through the sysfs control interface: +diff --git a/mm/memory.c b/mm/memory.c +index feff48e14..9fe7c8e59 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -146,6 +146,25 @@ EXPORT_SYMBOL(zero_pfn); + + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++EXPORT_SYMBOL_GPL(uksm_zero_pfn); ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); ++ if (!empty_uksm_zero_page) ++ panic("Oh boy, that early out of memory?"); ++ ++ SetPageReserved(empty_uksm_zero_page); ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -161,6 +180,7 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) + trace_rss_stat(mm, member, count); + } + ++ + #if defined(SPLIT_RSS_COUNTING) + + void sync_mm_rss(struct mm_struct *mm) +@@ -869,6 +889,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); ++ } else { ++ uksm_map_zero_page(pte); + } + + /* +@@ -1248,8 +1273,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + + if (!PageAnon(page)) { + if (pte_dirty(ptent)) { +@@ -2597,6 +2624,7 @@ static 
inline bool cow_user_page(struct page *dst, struct page *src, + + if (likely(src)) { + copy_user_highpage(dst, src, addr, vma); ++ uksm_cow_page(vma, src); + return true; + } + +@@ -2843,6 +2871,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) + vmf->address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, vmf->orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); +@@ -2885,7 +2914,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) + mm_counter_file(old_page)); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } ++ uksm_bugon_zeropage(vmf->orig_pte); + } else { ++ uksm_unmap_zero_page(vmf->orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); +diff --git a/mm/mmap.c b/mm/mmap.c +index dc7206032..f2b9cbd7f 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -181,6 +182,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + vm_area_free(vma); + return next; + } +@@ -750,9 +752,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL, *importer = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -883,6 +892,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + end_changed = true; + } + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next; + next->vm_pgoff += adjust_next >> PAGE_SHIFT; +@@ -987,6 +997,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + if (remove_next == 2) 
{ + remove_next = 1; + end = next->vm_end; ++ uksm_remove_vma(next); + goto again; + } + else if (next) +@@ -1013,10 +1024,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + */ + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + if (insert && file) + uprobe_mmap(insert); + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -1472,6 +1487,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1867,6 +1885,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, + allow_write_access(file); + } + file = vma->vm_file; ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + +@@ -1909,6 +1928,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + free_vma: ++ uksm_remove_vma(vma); + vm_area_free(vma); + unacct_error: + if (charged) +@@ -2768,6 +2788,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. 
*/ + if (!err) + return 0; +@@ -3075,6 +3097,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) +@@ -3120,6 +3143,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -3197,6 +3221,12 @@ void exit_mmap(struct mm_struct *mm) + mmap_write_unlock(mm); + } + ++ /* ++ * Taking write lock on mmap does not harm others, ++ * but it's crucial for uksm to avoid races. ++ */ ++ mmap_write_lock(mm); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -3232,6 +3262,11 @@ void exit_mmap(struct mm_struct *mm) + cond_resched(); + } + vm_unacct_memory(nr_accounted); ++ ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ vmacache_invalidate(mm); ++ mmap_write_unlock(mm); + } + + /* Insert vm structure into process list sorted by address +@@ -3339,6 +3374,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; ++ uksm_vma_add_new(new_vma); + } + return new_vma; + +@@ -3507,6 +3543,7 @@ static struct vm_area_struct *__install_special_mapping( + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return vma; + +diff --git a/mm/uksm.c b/mm/uksm.c +new file mode 100644 +index 000000000..e4732c00b +--- /dev/null ++++ b/mm/uksm.c +@@ -0,0 +1,5614 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. 
Some basic data structures and routines ++ * are borrowed from ksm.c. ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it ++ * can scan memory areas that do not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area (a VMA that has frequent KSM page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than the default C version on x86. ++ * * rmap_item now has a struct page *page member to loosely cache an ++ * address-->page mapping, which avoids many time-costly ++ * follow_page() calls. ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No ++ * break_ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration (contributed by Figo Zhang) ++ * Now uksmd considers full zero pages as special pages and merges them ++ * into a special unswappable uksm zero page.
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? ++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ 
unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned int index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = 
cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. 
if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. ++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstale rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in a same stable ++ * node together. 
++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __aligned(4); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __aligned(4); /* 4 aligned to fit in to pages*/ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. 
++ */ ++static unsigned long long fully_scanned_round = 1; ++ ++/* The total number of virtual pages of all vma slots */ ++static u64 uksm_pages_total; ++ ++/* The number of pages has been scanned since the start up */ ++static u64 uksm_pages_scanned; ++ ++static u64 scanned_virtual_pages; ++ ++/* The number of pages has been scanned since last encode_benefit call */ ++static u64 uksm_pages_scanned_last; ++ ++/* If the scanned number is tooo large, we encode it here */ ++static u64 pages_scanned_stored; ++ ++static unsigned long pages_scanned_base; ++ ++/* The number of nodes in the stable tree */ ++static unsigned long uksm_pages_shared; ++ ++/* The number of page slots additionally sharing those nodes */ ++static unsigned long uksm_pages_sharing; ++ ++/* The number of nodes in the unstable tree */ ++static unsigned long uksm_pages_unshared; ++ ++/* ++ * Milliseconds ksmd should sleep between scans, ++ * >= 100ms to be consistent with ++ * scan_time_to_sleep_msec() ++ */ ++static unsigned int uksm_sleep_jiffies; ++ ++/* The real value for the uksmd next sleep */ ++static unsigned int uksm_sleep_real; ++ ++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ ++static unsigned int uksm_sleep_saved; ++ ++/* Max percentage of cpu utilization ksmd can take to scan in one batch */ ++static unsigned int uksm_max_cpu_percentage; ++ ++static int uksm_cpu_governor; ++ ++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; ++ ++struct uksm_cpu_preset_s { ++ int cpu_ratio[SCAN_LADDER_SIZE]; ++ unsigned int cover_msecs[SCAN_LADDER_SIZE]; ++ unsigned int max_cpu; /* percentage */ ++}; ++ ++struct uksm_cpu_preset_s uksm_cpu_preset[4] = { ++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, ++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, ++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, ++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, ++}; ++ ++/* The default value for uksm_ema_page_time if it's not 
initialized */ ++#define UKSM_PAGE_TIME_DEFAULT 500 ++ ++/*cost to scan one page by expotional moving average in nsecs */ ++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++/* The expotional moving average alpha weight, in percentage. */ ++#define EMA_ALPHA 20 ++ ++/* ++ * The threshold used to filter out thrashing areas, ++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound ++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio ++ * will be considered as having a zero duplication ratio. ++ */ ++static unsigned int uksm_thrash_threshold = 50; ++ ++/* How much dedup ratio is considered to be abundant*/ ++static unsigned int uksm_abundant_threshold = 10; ++ ++/* All slots having merged pages in this eval round. */ ++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); ++ ++/* How many times the ksmd has slept since startup */ ++static unsigned long long uksm_sleep_times; ++ ++#define UKSM_RUN_STOP 0 ++#define UKSM_RUN_MERGE 1 ++static unsigned int uksm_run = 1; ++ ++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); ++static DEFINE_MUTEX(uksm_thread_mutex); ++ ++/* ++ * List vma_slot_new is for newly created vma_slot waiting to be added by ++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to ++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding ++ * VMA has been removed/freed. ++ */ ++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); ++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); ++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); ++static DEFINE_SPINLOCK(vma_slot_list_lock); ++ ++/* The unstable tree heads */ ++static struct rb_root root_unstable_tree = RB_ROOT; ++ ++/* ++ * All tree_nodes are in a list to be freed at once when unstable tree is ++ * freed after each scan round. 
++ */ ++static struct list_head unstable_tree_node_list = ++ LIST_HEAD_INIT(unstable_tree_node_list); ++ ++/* List contains all stable nodes */ ++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); ++ ++/* ++ * When the hash strength is changed, the stable tree must be delta_hashed and ++ * re-structured. We use two set of below structs to speed up the ++ * re-structuring of stable tree. ++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum 
rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. ++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. 
++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ 
put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? ++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. 
++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. 
++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ unsigned long kpfn; ++ ++ expected_mapping = (void *)((unsigned long)stable_node | ++ PAGE_MAPPING_KSM); ++again: ++ kpfn = READ_ONCE(stable_node->kpfn); ++ page = pfn_to_page(kpfn); ++ ++ /* ++ * page is computed from kpfn, so on most architectures reading ++ * page->mapping is naturally ordered after reading node->kpfn, ++ * but on Alpha we need to be more careful. ++ */ ++ smp_rmb(); ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) ++ goto stale; ++ ++ /* ++ * We cannot do anything with the page while its refcount is 0. ++ * Usually 0 means free, or tail of a higher-order page: in which ++ * case this node is no longer referenced, and should be freed; ++ * however, it might mean that the page is under page_freeze_refs(). ++ * The __remove_mapping() case is easy, again the node is now stale; ++ * but if page is swapcache in migrate_page_move_mapping(), it might ++ * still be our page, in which case it's essential to keep the node. ++ */ ++ while (!get_page_unless_zero(page)) { ++ /* ++ * Another check for page->mapping != expected_mapping would ++ * work here too. We have chosen the !PageSwapCache test to ++ * optimize the common case, when the page is or is about to ++ * be freed: PageSwapCache is cleared (under spin_lock_irq) ++ * in the freeze_refs section of __remove_mapping(); but Anon ++ * page->mapping reset to NULL later, in free_pages_prepare(). 
++ */ ++ if (!PageSwapCache(page)) ++ goto stale; ++ cpu_relax(); ++ } ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ ++ lock_page(page); ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ unlock_page(page); ++ put_page(page); ++ goto stale; ++ } ++ unlock_page(page); ++ return page; ++stale: ++ /* ++ * We come here from above when page->mapping or !PageSwapCache ++ * suggests that the node is stale; but it might be under migration. ++ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), ++ * before checking whether node->kpfn has been changed. ++ */ ++ smp_rmb(); ++ if (stable_node->kpfn != kpfn) ++ goto again; ++ ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. 
++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/* must be done with sem locked; returns 0, or -ENOMEM with no pool left allocated */ ++static int slot_pool_alloc(struct vma_slot *slot) ++{ ++ unsigned long pool_size; ++ ++ if (slot->rmap_list_pool) ++ return 0; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), ++ GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ return -ENOMEM; ++ ++ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ slot->rmap_list_pool = NULL; /* the early-return check above must not see a freed pool */ ++ return -ENOMEM; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ ++ return 0; ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot 
*slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ spin_lock(&vma_slot_list_lock); ++ slot = vma->uksm_vma_slot; ++ if (!slot) ++ goto out; ++ ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. ++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++out: ++ vma->uksm_vma_slot = NULL; ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. 
If now it's not empty, then it must be moved to del list ++ */ ++ if (!slot_in_uksm(slot)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ BUG_ON(slot->pages != vma_pages(slot->vma)); ++ /* Ok, vma still valid */ ++ vma = slot->vma; ++ mm = vma->vm_mm; ++ sem = &mm->mmap_lock; ++ ++ if (uksm_test_exit(mm)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ if (down_read_trylock(sem)) { ++ spin_unlock(&vma_slot_list_lock); ++ if (slot_pool_alloc(slot)) { ++ uksm_remove_vma(vma); ++ up_read(sem); ++ return -ENOENT; ++ } ++ return 0; ++ } ++ ++ spin_unlock(&vma_slot_list_lock); ++ return -EBUSY; ++} ++ ++static inline unsigned long ++vma_page_address(struct page *page, struct vm_area_struct *vma) ++{ ++ pgoff_t pgoff = page->index; ++ unsigned long address; ++ ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ /* page should be within @vma mapping range */ ++ return -EFAULT; ++ } ++ return address; ++} ++ ++ ++/* return 0 on success with the item's mmap_sem locked */ ++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) ++{ ++ struct mm_struct *mm; ++ struct vma_slot *slot = item->slot; ++ int err = -EINVAL; ++ ++ struct page *page; ++ ++ /* ++ * try_down_read_slot_mmap_sem() returns non-zero if the slot ++ * has been removed by uksm_remove_vma(). ++ */ ++ if (try_down_read_slot_mmap_sem(slot)) ++ return -EBUSY; ++ ++ mm = slot->vma->vm_mm; ++ ++ if (uksm_test_exit(mm)) ++ goto failout_up; ++ ++ page = item->page; ++ rcu_read_lock(); ++ if (!get_page_unless_zero(page)) { ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ ++ /* No need to consider huge page here. */ ++ if (item->slot->vma->anon_vma != page_anon_vma(page) || ++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { ++ /* ++ * TODO: ++ * should we release this item becase of its stale page ++ * mapping? 
++ */ ++ put_page(page); ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ rcu_read_unlock(); ++ return 0; ++ ++failout_up: ++ mmap_read_unlock(mm); ++ return err; ++} ++ ++/* ++ * What kind of VMA is considered? ++ */ ++static inline int vma_can_enter(struct vm_area_struct *vma) ++{ ++ return uksm_flags_can_scan(vma->vm_flags); ++} ++ ++/* ++ * Called whenever a fresh new vma is created. A new vma_slot ++ * is created and inserted into a global list. Must be called ++ * after vma is inserted to its mm. ++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. 
++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with the "from" hash strength ++ * ++ * return the hash value at the "to" hash strength ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. 
++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr); ++ ++ if (cost_accounting) { ++ if (hash_strength < HASH_STRENGTH_FULL) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages_with_cost(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1); ++ addr2 = 
kmap_atomic(page2); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical_with_cost(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages_with_cost(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct page_vma_mapped_walk pvmw = { ++ .page = page, ++ .vma = vma, ++ }; ++ struct mmu_notifier_range range; ++ int swapped; ++ int err = -EFAULT; ++ ++ pvmw.address = page_address_in_vma(page, vma); ++ if (pvmw.address == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ++ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, ++ pvmw.address + PAGE_SIZE); ++ mmu_notifier_invalidate_range_start(&range); ++ ++ if (!page_vma_mapped_walk(&pvmw)) ++ goto out_mn; ++ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) ++ goto out_unlock; ++ ++ if (old_pte) ++ *old_pte = *pvmw.pte; ++ ++ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || ++ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ flush_cache_page(vma, pvmw.address, page_to_pfn(page)); ++ /* ++ * Ok this is tricky, when get_user_pages_fast() run it doesn't ++ * take any lock, therefore the check that we are going to make ++ * with the pagecount against the mapcount is racey and ++ * O_DIRECT can happen right after the check. ++ * So we clear the pte and flush the tlb before the check ++ * this assure us that no O_DIRECT can happen after the check ++ * or in the middle of the check. 
++ */ ++ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ /* ++ * Check that no O_DIRECT or similar I/O is in progress on the ++ * page ++ */ ++ if (page_mapcount(page) + 1 + swapped != page_count(page)) { ++ set_pte_at(mm, pvmw.address, pvmw.pte, entry); ++ goto out_unlock; ++ } ++ if (pte_dirty(entry)) ++ set_page_dirty(page); ++ ++ if (pte_protnone(entry)) ++ entry = pte_mkclean(pte_clear_savedwrite(entry)); ++ else ++ entry = pte_mkclean(pte_wrprotect(entry)); ++ ++ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); ++ } ++ *orig_pte = *pvmw.pte; ++ err = 0; ++ ++out_unlock: ++ page_vma_mapped_walk_done(&pvmw); ++out_mn: ++ mmu_notifier_invalidate_range_end(&range); ++out: ++ return err; ++} ++ ++#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ ++#define MERGE_ERR_COLLI 2 /* there is a collision */ ++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ ++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ ++ ++ ++/** ++ * replace_page - replace page in vma by new ksm page ++ * @vma: vma that holds the pte pointing to page ++ * @page: the page we are replacing by kpage ++ * @kpage: the ksm page we replace page by ++ * @orig_pte: the original value of the pte ++ * ++ * Returns 0 on success, MERGE_ERR_PGERR on failure. 
++ */ ++static int replace_page(struct vm_area_struct *vma, struct page *page, ++ struct page *kpage, pte_t orig_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct mmu_notifier_range range; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ pte_t entry; ++ ++ unsigned long addr; ++ int err = MERGE_ERR_PGERR; ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ BUG_ON(pmd_trans_huge(*pmd)); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, ++ addr + PAGE_SIZE); ++ mmu_notifier_invalidate_range_start(&range); ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, orig_pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ goto out_mn; ++ } ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ entry = mk_pte(kpage, vma->vm_page_prot); ++ ++ /* special treatment is needed for zero_page */ ++ if ((page_to_pfn(kpage) == uksm_zero_pfn) || ++ (page_to_pfn(kpage) == zero_pfn)) { ++ entry = pte_mkspecial(entry); ++ dec_mm_counter(mm, MM_ANONPAGES); ++ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); ++ } else { ++ get_page(kpage); ++ page_add_anon_rmap(kpage, vma, addr, false); ++ } ++ ++ set_pte_at_notify(mm, addr, ptep, entry); ++ ++ page_remove_rmap(page, false); ++ if (!page_mapped(page)) ++ try_to_free_swap(page); ++ put_page(page); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out_mn: ++ mmu_notifier_invalidate_range_end(&range); ++out: ++ return err; ++} ++ ++ ++/** ++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. 
The ++ * zero hash value at HASH_STRENGTH_MAX is used to indicate that its ++ * hash_max member has not been calculated. ++ * ++ * @page The page that needs to be hashed ++ * @hash_old The hash value calculated with current hash strength ++ * ++ * return the new hash value calculated at HASH_STRENGTH_MAX ++ */ ++static inline u32 page_hash_max(struct page *page, u32 hash_old) ++{ ++ u32 hash_max = 0; ++ void *addr; ++ ++ addr = kmap_atomic(page); ++ hash_max = delta_hash(addr, hash_strength, ++ HASH_STRENGTH_MAX, hash_old); ++ ++ kunmap_atomic(addr); ++ ++ if (!hash_max) ++ hash_max = 1; ++ ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ return hash_max; ++} ++ ++/* ++ * We compare the hash again, to ensure that it is really a hash collision ++ * instead of being caused by page write. ++ */ ++static inline int check_collision(struct rmap_item *rmap_item, ++ u32 hash) ++{ ++ int err; ++ struct page *page = rmap_item->page; ++ ++ /* if this rmap_item has already been hash_maxed, then the collision ++ * must appear in the second-level rbtree search. In this case we check ++ * if its hash_max value has been changed. Otherwise, the collision ++ * happens in the first-level rbtree search, so we check against its ++ * current hash value. ++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, non-zero (normally a MERGE_ERR_* code) otherwise. 
++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ /* do not store the result in err: a successful split (0) is not a merge */ ++ if (split_huge_page(page)) ++ goto out_unlock; ++ } ++ ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical_with_cost(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. 
++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ /* do not store the result in err: a successful split (0) is not a merge */ ++ if (split_huge_page(page)) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. 
++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ if (!PageDirty(page)) ++ SetPageDirty(page); ++ ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (!PageAnon(tree_page)) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (PageTransCompound(tree_page)) { ++ /* keep err at MERGE_ERR_PGERR: later failures must not return 0 */ ++ if (split_huge_page(tree_page)) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ } ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical_with_cost(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; ++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { 
++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. ++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. 
++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ struct vm_area_struct *vma = item->slot->vma; ++ struct page_vma_mapped_walk pvmw = { ++ .page = kpage, ++ .vma = vma, ++ }; ++ ++ pvmw.address = get_rmap_addr(item); ++ if (!page_vma_mapped_walk(&pvmw)) ++ return 0; ++ ++ if (pte_write(*pvmw.pte)) { ++ /* has changed, abort! */ ++ page_vma_mapped_walk_done(&pvmw); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, pvmw.address, false); ++ ++ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); ++ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage, false); ++ put_page(kpage); ++ ++ page_vma_mapped_walk_done(&pvmw); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. 
++ * @item2: the other rmap_item we found when unstable tree search ++ * @oldpage: the page currently mapped by the two rmap_items ++ * @tree_page: the page we found identical in stable tree node ++ * @success1: return if item1 is successfully merged ++ * @success2: return if item2 is successfully merged ++ */ ++static void try_merge_with_stable(struct rmap_item *item1, ++ struct rmap_item *item2, ++ struct page **kpage, ++ struct page *tree_page, ++ int *success1, int *success2) ++{ ++ struct vm_area_struct *vma1 = item1->slot->vma; ++ struct vm_area_struct *vma2 = item2->slot->vma; ++ *success1 = 0; ++ *success2 = 0; ++ ++ if (unlikely(*kpage == tree_page)) { ++ /* I don't think this can really happen */ ++ pr_warn("UKSM: unexpected condition detected in " ++ "%s -- *kpage == tree_page !\n", __func__); ++ *success1 = 1; ++ *success2 = 1; ++ return; ++ } ++ ++ if (!PageAnon(*kpage) || !PageKsm(*kpage)) ++ goto failed; ++ ++ if (!trylock_page(tree_page)) ++ goto failed; ++ ++ /* If the oldpage is still ksm and still pointed ++ * to in the right place, and still write protected, ++ * we are confident it's not changed, no need to ++ * memcmp anymore. ++ * be ware, we cannot take nested pte locks, ++ * deadlock risk. ++ */ ++ if (!try_merge_rmap_item(item1, *kpage, tree_page)) ++ goto unlock_failed; ++ ++ /* ok, then vma2, remind that pte1 already set */ ++ if (!try_merge_rmap_item(item2, *kpage, tree_page)) ++ goto success_1; ++ ++ *success2 = 1; ++success_1: ++ *success1 = 1; ++ ++ ++ if ((*success1 && vma1->vm_flags & VM_LOCKED) || ++ (*success2 && vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(*kpage); ++ if (!PageMlocked(tree_page)) ++ mlock_vma_page(tree_page); ++ } ++ ++ /* ++ * We do not need oldpage any more in the caller, so can break the lock ++ * now. ++ */ ++ unlock_page(*kpage); ++ *kpage = tree_page; /* Get unlocked outside. 
*/ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. 
++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. ++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages_with_cost(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ 
return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. 
++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. 
++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. Get its page and trylock the mmap_sem ++ */ ++static inline ++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ++ u32 hash) ++ ++{ ++ struct rb_node **new = &root_unstable_tree.rb_node; ++ struct rb_node *parent = NULL; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ struct rmap_item *tree_rmap_item; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* got the tree_node */ ++ if (tree_node->count == 1) { ++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, ++ struct rmap_item, node); ++ BUG_ON(!tree_rmap_item); ++ ++ goto get_page_out; ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ ++ while (*new) { ++ int cmp; ++ ++ tree_rmap_item = rb_entry(*new, struct rmap_item, ++ node); ++ ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = *new; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto get_page_out; ++ } ++ } else { ++ /* alloc a new tree_node */ ++ tree_node = alloc_tree_node(&unstable_tree_node_list); ++ if (!tree_node) ++ return NULL; ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, 
new); ++ rb_insert_color(&tree_node->node, &root_unstable_tree); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ /* did not found even in sub-tree */ ++ rmap_item->tree_node = tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, &tree_node->sub_root); ++ ++ uksm_pages_unshared++; ++ return NULL; ++ ++get_page_out: ++ if (tree_rmap_item->page == rmap_item->page) ++ return NULL; ++ ++ if (get_tree_rmap_item_page(tree_rmap_item)) ++ return NULL; ++ ++ return tree_rmap_item; ++} ++ ++static void hold_anon_vma(struct rmap_item *rmap_item, ++ struct anon_vma *anon_vma) ++{ ++ rmap_item->anon_vma = anon_vma; ++ get_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication ++ * ratio statistics is done in this function. ++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ node_vma_cont = node_vma; ++ } ++ } ++ ++node_vma_new: ++ /* no same 
vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (node_vma_cont) { ++ node_vma = node_vma_cont; ++ hlist_for_each_entry_continue(node_vma, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 
++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma, addr, ++ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, ++ NULL); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. ++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. ++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. 
++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. 
++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. 
++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. ++ */ ++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) ++ return; ++ } ++ ++ tree_rmap_item = ++ unstable_tree_search_insert(rmap_item, hash); ++ if (tree_rmap_item) { ++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); ++ /* ++ * As soon as we merge this page, we want to remove the ++ * rmap_item of the page we have merged with from the unstable ++ * tree, and insert it instead as new node in the stable tree. 
++ */ ++ if (!err) { ++ kpage = page; ++ remove_rmap_item_from_tree(tree_rmap_item); ++ lock_page(kpage); ++ snode = stable_tree_insert(&kpage, hash, ++ rmap_item, tree_rmap_item, ++ &success1, &success2); ++ ++ /* ++ * Do not log dedup for tree item, it's not counted as ++ * scanned in this round. ++ */ ++ if (success2) ++ stable_tree_append(tree_rmap_item, snode, 0); ++ ++ /* ++ * The order of these two stable append is important: ++ * we are scanning rmap_item. ++ */ ++ if (success1) ++ stable_tree_append(rmap_item, snode, 1); ++ ++ /* ++ * The original kpage may be unlocked inside ++ * stable_tree_insert() already. This page ++ * should be unlocked before doing ++ * break_cow(). ++ */ ++ unlock_page(kpage); ++ ++ if (!success1) ++ break_cow(rmap_item); ++ ++ if (!success2) ++ break_cow(tree_rmap_item); ++ ++ } else if (err == MERGE_ERR_COLLI) { ++ BUG_ON(tree_rmap_item->tree_node->count > 1); ++ ++ rmap_item_hash_max(tree_rmap_item, ++ tree_rmap_item->tree_node->hash); ++ ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = &tree_rmap_item->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto put_up_out; ++ ++ rmap_item->tree_node = tree_rmap_item->tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, ++ &tree_rmap_item->tree_node->sub_root); ++ rmap_item->tree_node->count++; ++ } else { ++ /* ++ * either one of the page has changed or they collide ++ * at the max hash, we consider them as ill items. 
++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ mmap_read_unlock(tree_rmap_item->slot->vma->vm_mm); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return 
addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ 
set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if 
(i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* free empty pool entries which contain no rmap_item */ ++ /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. 
++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (prandom_u32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ ++ if (!swap_entry) ++ return NULL; ++ ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ 
inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; ++ inc_rmap_list_pool_count(slot, scan_index); ++ } else ++ goto putpage; ++ } ++ ++ BUG_ON(item->slot != slot); ++ /* the page may have changed */ ++ item->page = page; ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return item; ++ ++putpage: ++ put_page(page); ++ page = NULL; ++nopage: ++ /* no page, store addr back and free rmap_item if possible */ ++ free_entry_item(scan_entry); ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return NULL; ++} ++ ++static inline int in_stable_tree(struct rmap_item *rmap_item) ++{ ++ return rmap_item->address & STABLE_FLAG; ++} ++ ++/** ++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with ++ * mmap_sem locked. 
++ */ ++static noinline void scan_vma_one_page(struct vma_slot *slot) ++{ ++ u32 hash; ++ struct mm_struct *mm; ++ struct rmap_item *rmap_item = NULL; ++ struct vm_area_struct *vma = slot->vma; ++ ++ mm = vma->vm_mm; ++ BUG_ON(!mm); ++ BUG_ON(!slot); /* NOTE(review): slot was already dereferenced by the vma initializer above */ ++ ++ rmap_item = get_next_rmap_item(slot, &hash); ++ if (!rmap_item) ++ goto out1; ++ ++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) ++ goto out2; ++ ++ cmp_and_merge_page(rmap_item, hash); ++out2: ++ put_page(rmap_item->page); ++out1: ++ slot->pages_scanned++; ++ slot->this_sampled++; ++ if (slot->fully_scanned_round != fully_scanned_round) ++ scanned_virtual_pages++; ++ ++ if (vma_fully_scanned(slot)) ++ slot->fully_scanned_round = fully_scanned_round; ++} ++ ++static inline unsigned long rung_get_pages(struct scan_rung *rung) ++{ ++ struct slot_tree_node *node; ++ ++ if (!rung->vma_root.rnode) ++ return 0; ++ ++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); ++ ++ return node->size; ++} ++ ++#define RUNG_SAMPLED_MIN 3 ++ ++static inline ++void uksm_calc_rung_step(struct scan_rung *rung, ++ unsigned long page_time, unsigned long ratio) ++{ ++ unsigned long sampled, pages; ++ ++ /* no cover time set: the rung is fully scanned, one page per step */ ++ if (!rung->cover_msecs) { ++ rung->step = 1; ++ return; ++ } ++ ++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) ++ * ratio / page_time; ++ ++ /* ++ * Before we finish a scan round and expensive per-round jobs, ++ * we need to have a chance to estimate the per page time. So ++ * the sampled number can not be too small.
++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); /* NOTE(review): raw cpu_ratio here, while uksm_calc_scan_pages() passes rung_real_ratio() - confirm */ ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = prandom_u32() % rung->step; /* random start offset within one step */ ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * Advance the rung to its next slot; return 1 if the scan wrapped and was reset, else 0.
++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned long n; /* as wide as rung->step so the remainder below is never truncated */ ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan() looped back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) /* the rung holds only the slots just added */ ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) /* the rung holds only the slot just added */ ++ reset_current_scan(rung, 0, 1); ++ ++ return 0;
++} ++ ++/* ++ * Move @slot to @rung. Return 1 if the slot was removed from its old rung, 0 if it already was in @rung. ++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio (percent of this_sampled) for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->this_sampled; ++ if (!pages) ++ return 0; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot from pages_bemerged over all of its pages.
++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->pages; ++ if (!pages) ++ return 0; ++ ++ ret = slot->pages_bemerged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree needs to be restructured, this is the function re-inserting the ++ * stable node. ++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; /* NOTE(review): shadows the function-scope cmp declared above */ ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* found a tree node with the same first-level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++
goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node was deleted but the tree node ++ * was not, so reuse the tree node. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ pr_err("UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* Reached when two nodes have collided in both hash levels, ++ * or when the tree node allocation above failed. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the whole tree.
++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes to a new ++ * stable tree. We don't want to touch the old tree unlinks and ++ * old tree_nodes. The old tree_nodes will be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr); ++ } else { ++ /* ++ * it was not inserted into the rbtree due to a collision in the last ++ * round scan.
++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; /* callers pass hash_strength_delta, which inc_hash_strength_delta() caps */ ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) /* neg * 100 would overflow: scale pos down instead */ ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); /* NOTE(review): assumes benefit.scanned != 0 - confirm against encode_benefit() */ ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* ++ * Try to probe a value after the boot, and in case the system ++ * is still for a long time.
++ */ ++ if ((fully_scanned_round & 0xFFULL) == 10) { /* once in every 256 fully scanned rounds */ ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta, stable_benefit); ++ ++ if (delta > 50) { /* benefit moved by more than 50% of stable_benefit */ ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed.
++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: /* probing downward from stable_point */ ++ if (rshash_state.lookup_window_index++ % 5 == 0) /* below_count restarts every 5 rounds */ ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state =
RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: /* probing upward from stable_point */ ++ if (rshash_state.lookup_window_index++ % 5 == 0) /* below_count restarts every 5 rounds */ ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > rshash_state.turn_benefit_up) { ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - Clear every rung's ROUND_FINISHED flag and promote the ++ * slots queued on vma_slot_dedup that are abundant; run whenever a scan round is finished.
++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may have been removed by rung_rm_slot() when its mm exited */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. ++ */ ++ rung_rm_slot(slot); ++ } ++ ++ if (!list_empty(&slot->dedup_list)) ++ list_del(&slot->dedup_list); ++ ++ if (!slot->rmap_list_pool || !slot->pool_counts) { ++ /* In case it OOMed in uksm_vma_enter() */ ++ goto out; ++ } ++ ++ for (i = 0; i < slot->pool_size; i++) { ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ addr = kmap(slot->rmap_list_pool[i]); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ ++ remove_rmap_item_from_tree(entry->item); ++ free_rmap_item(entry->item); ++ slot->pool_counts[i]--; ++ } ++ BUG_ON(slot->pool_counts[i]); ++ kunmap(slot->rmap_list_pool[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ } ++ kfree(slot->rmap_list_pool); ++ kfree(slot->pool_counts); ++ ++out: ++ slot->rung = NULL; ++ if (slot->flags & UKSM_SLOT_IN_UKSM) { ++ BUG_ON(uksm_pages_total < slot->pages); ++ uksm_pages_total -= slot->pages; ++ } ++ ++ if (slot->fully_scanned_round == fully_scanned_round) ++ scanned_virtual_pages -= slot->pages; ++ else ++ scanned_virtual_pages -= slot->pages_scanned; ++
free_vma_slot(slot); ++} ++ ++ ++#define SPIN_LOCK_PERIOD 32 ++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; ++static inline void cleanup_vma_slots(void) ++{ ++ struct vma_slot *slot; ++ int i; ++ ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_del)) { ++ slot = list_entry(vma_slot_del.next, ++ struct vma_slot, slot_list); ++ list_del(&slot->slot_list); ++ cleanup_slots[i++] = slot; ++ if (i == SPIN_LOCK_PERIOD) { ++ spin_unlock(&vma_slot_list_lock); ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ spin_unlock(&vma_slot_list_lock); ++ ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++} ++ ++/* ++ * Exponential moving average formula ++ */ ++static inline unsigned long ema(unsigned long curr, unsigned long last_ema) ++{ ++ /* ++ * For a very high burst, even the ema cannot work well, a false very ++ * high per-page time estimation can result in feedback in very high ++ * overhead of context switch and rung update -- this will then lead ++ * to higher per-page time, this may not converge. ++ * ++ * Instead, we try to approach this value in a binary manner. ++ */ ++ if (curr > last_ema * 10) ++ return last_ema * 2; ++ ++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; ++} ++ ++/* ++ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to ++ * nanoseconds based on current uksm_sleep_jiffies. ++ */ ++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) ++{ ++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / ++ (TIME_RATIO_SCALE - ratio) * ratio; ++} ++ ++ ++static inline unsigned long rung_real_ratio(int cpu_time_ratio) ++{ ++ unsigned long ret; ++ ++ BUG_ON(!cpu_time_ratio); ++ ++ if (cpu_time_ratio > 0) ++ ret = cpu_time_ratio; ++ else /* negative ratio: its magnitude is scaled by uksm_max_cpu_percentage / 100 */ ++ ret = (unsigned long)(-cpu_time_ratio) * ++ uksm_max_cpu_percentage / 100UL; ++ ++ return ret ?
ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval rounds, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung to scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows.
ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* ns to roughly ms: divides by 2^20, not by 10^6 */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ ++ rung = &uksm_scan_ladder[0]; ++ rung_add_new_slots(rung, slots, num); ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if (++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot)
++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ slot->this_sampled = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If it was deleted above, then the rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { /* more than a quarter of uksm_pages_total */ ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++#define BUSY_RETRY 100 ++ ++/** ++ * uksm_do_scan() - the main worker function.
++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ int busy_retry; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ busy_retry = BUSY_RETRY; ++ /* ++ * Do not consider rung_round_finished() here, just use up the ++ * rung->pages_to_scan quota.
++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) ++ err = 0; /* mmap lock is still held from the previous page of this batch */ ++ else ++ err = try_down_read_slot_mmap_sem(slot); ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ busy_retry--; ++ if (iter->vma->vm_mm != busy_mm || ++ !busy_retry || reset) ++ break; ++ } while (1); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ mmap_read_unlock(slot->vma->vm_mm); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* Ok, we have taken the mmap lock, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ mmap_read_unlock(slot->vma->vm_mm); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ mmap_read_unlock(slot->vma->vm_mm); ++ } ++ ++ busy_retry = BUSY_RETRY; ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ mmap_read_unlock(slot->vma->vm_mm); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if
(!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low). ++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second upper bound for responsiveness.
*/ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) ++ uksm_do_scan(); ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ int search_new_forks = 0; /* pass 0 visits each item's own slot vma, pass 1 every other vma */ ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(!PageKsm(page), page); ++ VM_BUG_ON_PAGE(!PageLocked(page), page); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return; ++again: ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ cond_resched(); ++ anon_vma_lock_read(anon_vma); ++ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, ++ 0, ULONG_MAX) { ++ cond_resched(); ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) ++ continue; ++ ++ if (!rwc->rmap_one(page, vma, address, rwc->arg)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ ++ if (rwc->done && rwc->done(page)) { ++
anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ } ++ anon_vma_unlock_read(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++} ++ ++#ifdef CONFIG_MIGRATION ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); ++ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ /* ++ * newpage->mapping was set in advance; now we need smp_wmb() ++ * to make sure that the new stable_node->kpfn is visible ++ * to get_ksm_page() before it can see that oldpage->mapping ++ * has gone stale (or that PageSwapCache has been cleared). ++ */ ++ smp_wmb(); ++ set_page_stable_node(oldpage, NULL); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); /* NOTE(review): stable_node_reinsert() links struct tree_node into this root and keeps stable_nodes in tree_node->sub_root - verify this cast */ ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline.
++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space.
++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = kstrtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = kstrtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); 
++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >= 0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) ++ uksm_run = flags; ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ return count; ++} ++UKSM_ATTR(run); ++ ++static ssize_t abundant_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_abundant_threshold); ++} ++ ++static ssize_t abundant_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ 
uksm_abundant_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(abundant_threshold); ++ ++static ssize_t thrash_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_thrash_threshold); ++} ++ ++static ssize_t thrash_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_thrash_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(thrash_threshold); ++ ++static ssize_t cpu_ratios_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ if (rung->cpu_ratio > 0) ++ size = sprintf(p, "%d ", rung->cpu_ratio); ++ else ++ size = sprintf(p, "MAX/%d ", ++ TIME_RATIO_SCALE / -rung->cpu_ratio); ++ ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t cpu_ratios_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, cpuratios[SCAN_LADDER_SIZE], err; ++ unsigned long value; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ if (strstr(p, "MAX/")) { ++ p = strchr(p, '/') + 1; ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value); ++ } else { ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = value; ++ } ++ ++ p = end + 1; ++ } ++ ++ for 
(i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cpu_ratio = cpuratios[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(cpu_ratios); ++ ++static ssize_t eval_intervals_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ size = sprintf(p, "%u ", rung->cover_msecs); ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t eval_intervals_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, err; ++ unsigned long values[SCAN_LADDER_SIZE]; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ssize_t ret = count; ++ ++ p = kzalloc(count + 2, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ *end = '\0'; ++ } ++ ++ err = kstrtoul(p, 10, &values[i]); ++ if (err) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cover_msecs = values[i]; ++ } ++ ++out: ++ kfree(p); ++ return ret; ++} ++UKSM_ATTR(eval_intervals); ++ ++static ssize_t ema_per_page_time_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_ema_page_time); ++} ++UKSM_ATTR_RO(ema_per_page_time); ++ ++static ssize_t pages_shared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_shared); ++} ++UKSM_ATTR_RO(pages_shared); ++ ++static ssize_t pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_sharing); ++} ++UKSM_ATTR_RO(pages_sharing); ++ ++static 
ssize_t pages_unshared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_unshared); ++} ++UKSM_ATTR_RO(pages_unshared); ++ ++static ssize_t full_scans_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", fully_scanned_round); ++} ++UKSM_ATTR_RO(full_scans); ++ ++static ssize_t pages_scanned_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ unsigned long base = 0; ++ u64 delta, ret; ++ ++ if (pages_scanned_stored) { ++ base = pages_scanned_base; ++ ret = pages_scanned_stored; ++ delta = uksm_pages_scanned >> base; ++ if (CAN_OVERFLOW_U64(ret, delta)) { ++ ret >>= 1; ++ delta >>= 1; ++ base++; ++ ret += delta; ++ } ++ } else { ++ ret = uksm_pages_scanned; ++ } ++ ++ while (ret > ULONG_MAX) { ++ ret >>= 1; ++ base++; ++ } ++ ++ if (base) ++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); ++ else ++ return sprintf(buf, "%lu\n", (unsigned long)ret); ++} ++UKSM_ATTR_RO(pages_scanned); ++ ++static ssize_t hash_strength_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", hash_strength); ++} ++UKSM_ATTR_RO(hash_strength); ++ ++static ssize_t sleep_times_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", uksm_sleep_times); ++} ++UKSM_ATTR_RO(sleep_times); ++ ++ ++static struct attribute *uksm_attrs[] = { ++ &max_cpu_percentage_attr.attr, ++ &sleep_millisecs_attr.attr, ++ &cpu_governor_attr.attr, ++ &run_attr.attr, ++ &ema_per_page_time_attr.attr, ++ &pages_shared_attr.attr, ++ &pages_sharing_attr.attr, ++ &pages_unshared_attr.attr, ++ &full_scans_attr.attr, ++ &pages_scanned_attr.attr, ++ &hash_strength_attr.attr, ++ &sleep_times_attr.attr, ++ &thrash_threshold_attr.attr, ++ &abundant_threshold_attr.attr, ++ &cpu_ratios_attr.attr, ++ &eval_intervals_attr.attr, ++ NULL, ++}; ++ ++static struct 
attribute_group uksm_attr_group = { ++ .attrs = uksm_attrs, ++ .name = "uksm", ++}; ++#endif /* CONFIG_SYSFS */ ++ ++static inline void init_scan_ladder(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ slot_tree_init_root(&rung->vma_root); ++ } ++ ++ init_performance_values(); ++ uksm_calc_scan_pages(); ++} ++ ++static inline int cal_positive_negative_costs(void) ++{ ++ struct page *p1, *p2; ++ unsigned char *addr1, *addr2; ++ unsigned long i, time_start, hash_cost; ++ unsigned long loopnum = 0; ++ ++ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */ ++ volatile u32 hash; ++ volatile int ret; ++ ++ p1 = alloc_page(GFP_KERNEL); ++ if (!p1) ++ return -ENOMEM; ++ ++ p2 = alloc_page(GFP_KERNEL); ++ if (!p2) ++ return -ENOMEM; ++ ++ addr1 = kmap_atomic(p1); ++ addr2 = kmap_atomic(p2); ++ memset(addr1, prandom_u32(), PAGE_SIZE); ++ memcpy(addr2, addr1, PAGE_SIZE); ++ ++ /* make sure that the two pages differ in last byte */ ++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ time_start = jiffies; ++ while (jiffies - time_start < 100) { ++ for (i = 0; i < 100; i++) ++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); ++ loopnum += 100; ++ } ++ hash_cost = (jiffies - time_start); ++ ++ time_start = jiffies; ++ for (i = 0; i < loopnum; i++) ++ ret = pages_identical_with_cost(p1, p2); ++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); ++ memcmp_cost /= hash_cost; ++ pr_info("UKSM: relative memcmp_cost = %lu " ++ "hash=%u cmp_ret=%d.\n", ++ memcmp_cost, hash, ret); ++ ++ __free_page(p1); ++ __free_page(p2); ++ return 0; ++} ++ ++static int init_zeropage_hash_table(void) ++{ ++ struct page *page; ++ char *addr; ++ int i; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ addr = kmap_atomic(page); ++ memset(addr, 0, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ zero_hash_table = 
kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), ++ GFP_KERNEL); ++ if (!zero_hash_table) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_MAX; i++) ++ zero_hash_table[i] = page_hash(page, i, 0); ++ ++ __free_page(page); ++ ++ return 0; ++} ++ ++static inline int init_random_sampling(void) ++{ ++ unsigned long i; ++ ++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!random_nums) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) ++ random_nums[i] = i; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) { ++ unsigned long rand_range, swap_index, tmp; ++ ++ rand_range = HASH_STRENGTH_FULL - i; ++ swap_index = i + prandom_u32() % rand_range; ++ tmp = random_nums[i]; ++ random_nums[i] = random_nums[swap_index]; ++ random_nums[swap_index] = tmp; ++ } ++ ++ rshash_state.state = RSHASH_NEW; ++ rshash_state.below_count = 0; ++ rshash_state.lookup_window_index = 0; ++ ++ return cal_positive_negative_costs(); ++} ++ ++static int __init uksm_slab_init(void) ++{ ++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); ++ if (!rmap_item_cache) ++ goto out; ++ ++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); ++ if (!stable_node_cache) ++ goto out_free1; ++ ++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); ++ if (!node_vma_cache) ++ goto out_free2; ++ ++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); ++ if (!vma_slot_cache) ++ goto out_free3; ++ ++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); ++ if (!tree_node_cache) ++ goto out_free4; ++ ++ return 0; ++ ++out_free4: ++ kmem_cache_destroy(vma_slot_cache); ++out_free3: ++ kmem_cache_destroy(node_vma_cache); ++out_free2: ++ kmem_cache_destroy(stable_node_cache); ++out_free1: ++ kmem_cache_destroy(rmap_item_cache); ++out: ++ return -ENOMEM; ++} ++ ++static void __init uksm_slab_free(void) ++{ ++ kmem_cache_destroy(stable_node_cache); ++ kmem_cache_destroy(rmap_item_cache); ++ kmem_cache_destroy(node_vma_cache); ++ kmem_cache_destroy(vma_slot_cache); ++ kmem_cache_destroy(tree_node_cache); ++} ++ ++/* Common 
interface to ksm, different to it. */ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_might_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct anon_vma *anon_vma = page_anon_vma(page); ++ struct page *new_page; ++ ++ if (PageKsm(page)) { ++ if (page_stable_node(page)) ++ return page; /* no need to copy it */ ++ } else if (!anon_vma) { ++ return page; /* no need to copy it */ ++ } else if (anon_vma->root == vma->anon_vma->root && ++ page->index == linear_page_index(vma, address)) { ++ return page; /* still no need to copy it */ ++ } ++ if (!PageUptodate(page)) ++ return page; /* let do_swap_page report the error */ ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ __SetPageLocked(new_page); ++ } ++ ++ return new_page; ++} ++ ++/* Copied from mm/ksm.c and required from 5.1 */ ++bool reuse_ksm_page(struct page *page, ++ struct vm_area_struct *vma, ++ unsigned long address) ++{ ++#ifdef CONFIG_DEBUG_VM ++ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || ++ WARN_ON(!page_mapped(page)) || ++ WARN_ON(!PageLocked(page))) { ++ dump_page(page, "reuse_ksm_page"); ++ return false; ++ } ++#endif ++ ++ if (PageSwapCache(page) || !page_stable_node(page)) ++ return false; ++ /* Prohibit 
parallel get_ksm_page() */ ++ if (!page_ref_freeze(page, 1)) ++ return false; ++ ++ page_move_anon_rmap(page, vma); ++ page->index = linear_page_index(vma, address); ++ page_ref_unfreeze(page, 1); ++ ++ return true; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ pr_err("uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ pr_err("uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. 
++ */ ++ hotplug_memory_notifier(uksm_memory_callback, 100); ++#endif ++ return 0; ++ ++out_free: ++ kfree(zero_hash_table); ++out_free0: ++ uksm_slab_free(); ++out_free1: ++ kfree(random_nums); ++out_free2: ++ kfree(uksm_scan_ladder); ++ return err; ++} ++ ++#ifdef MODULE ++subsys_initcall(ksm_init); ++#else ++late_initcall(uksm_init); ++#endif ++ +diff --git a/mm/vmstat.c b/mm/vmstat.c +index f8942160f..95d6d9267 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1216,6 +1216,9 @@ const char * const vmstat_text[] = { + #endif + "nr_page_table_pages", + ++#ifdef CONFIG_UKSM ++ "nr_uksm_zero_pages", ++#endif + /* enum writeback_stat_item counters */ + "nr_dirty_threshold", + "nr_dirty_background_threshold", +-- +2.30.1.457.gf011795891 + diff --git a/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4501_muqss.patch b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4501_muqss.patch new file mode 100644 index 000000000..3ce1db196 --- /dev/null +++ b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4501_muqss.patch @@ -0,0 +1,11255 @@ +# Calculate format=diff merge(sys-kernel/calculate-sources[muqss])!= +From 18247bb9a2aca72363326f63e868b7cfda0d771c Mon Sep 17 00:00:00 2001 +From: Con Kolivas +Date: Fri, 25 Oct 2019 14:00:52 +1100 +Subject: [PATCH 01/16] MultiQueue Skiplist Scheduler v0.208 + +--- + .../admin-guide/kernel-parameters.txt | 8 + + Documentation/admin-guide/sysctl/kernel.rst | 34 + + Documentation/scheduler/sched-BFS.txt | 351 + + Documentation/scheduler/sched-MuQSS.txt | 373 + + arch/alpha/Kconfig | 2 + + arch/arm/Kconfig | 2 + + arch/arm64/Kconfig | 2 + + arch/powerpc/Kconfig | 2 + + arch/powerpc/platforms/cell/spufs/sched.c | 5 - + arch/x86/Kconfig | 18 + + drivers/cpufreq/Kconfig | 1 + + fs/proc/base.c | 2 +- + include/linux/init_task.h | 4 + + include/linux/ioprio.h | 2 + + include/linux/sched.h | 61 +- + include/linux/sched/deadline.h | 9 + + include/linux/sched/nohz.h | 
2 +- + include/linux/sched/prio.h | 12 + + include/linux/sched/rt.h | 2 + + include/linux/sched/task.h | 2 +- + include/linux/skip_list.h | 33 + + include/uapi/linux/sched.h | 9 +- + init/Kconfig | 24 +- + init/init_task.c | 10 + + init/main.c | 2 + + kernel/Kconfig.MuQSS | 106 + + kernel/Makefile | 3 +- + kernel/delayacct.c | 2 +- + kernel/exit.c | 4 +- + kernel/kthread.c | 30 +- + kernel/livepatch/transition.c | 6 +- + kernel/sched/Makefile | 10 +- + kernel/sched/MuQSS.c | 7931 +++++++++++++++++ + kernel/sched/MuQSS.h | 1082 +++ + kernel/sched/cpufreq_schedutil.c | 12 +- + kernel/sched/cpupri.h | 2 + + kernel/sched/cputime.c | 22 +- + kernel/sched/idle.c | 2 + + kernel/sched/sched.h | 35 + + kernel/sched/topology.c | 8 + + kernel/skip_list.c | 148 + + kernel/sysctl.c | 63 +- + kernel/time/Kconfig | 2 +- + kernel/time/clockevents.c | 5 + + kernel/time/posix-cpu-timers.c | 4 +- + kernel/time/timer.c | 7 +- + kernel/trace/trace_selftest.c | 5 + + 47 files changed, 10411 insertions(+), 50 deletions(-) + create mode 100644 Documentation/scheduler/sched-BFS.txt + create mode 100644 Documentation/scheduler/sched-MuQSS.txt + create mode 100644 include/linux/skip_list.h + create mode 100644 kernel/Kconfig.MuQSS + create mode 100644 kernel/sched/MuQSS.c + create mode 100644 kernel/sched/MuQSS.h + create mode 100644 kernel/skip_list.c + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a10b545c2070..788ff10a2abe 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4659,6 +4659,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. 
++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- Do not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 1d56a6b73a4e..51d1903f999b 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -436,6 +436,16 @@ this allows system administrators to override the + ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + kexec_load_disabled + =================== + +@@ -1077,6 +1087,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + sched_energy_aware + ================== + +@@ -1515,3 +1539,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. 
++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. 
The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. 
Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. 
This will increase if many tasks have CPU affinity set ++in their policy to limit which CPUs they're allowed to run on and they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. 
Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). 
If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. 
Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. 
++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. 
Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000. ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling.
++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. 
While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. 
The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued processes and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists.
++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. 
++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. 
The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. 
Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examined lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter.
Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regardless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads.
++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and Numa aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers.
Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default. Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Runqueue sharing. ++ ++By default MuQSS chooses to share runqueue resources (specifically the skip ++list and locking) between multicore siblings. It is configurable at build time ++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing ++only between simultaneous multithreading siblings, multicore siblings, or ++symmetric multiprocessing physical packages. Additionally it can be set at ++boot time with the use of the rqshare parameter. The reason for configurability ++is that some architectures have CPUs with many multicore siblings (>= 16) ++where it may be detrimental to throughput to share runqueues and another ++sharing option may be desirable. Additionally, more sharing than usual can ++improve latency on a system-wide level at the expense of throughput if desired. ++ ++The options are: ++none, smt, mc, smp ++ ++eg: ++ rqshare=mc ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO.
How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average. ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks.
To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. 
While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the niced task gets ++precisely the same amount of CPU power as the un-niced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index 1f51437d5765..4eb637d19095 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -674,6 +674,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 138248999df7..f9d4e5603639 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1211,6 +1211,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index f39568b28ec1..0531e228acfa 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -978,6 +978,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 107bb4319e0e..fd2dd69c4047 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -898,6 +898,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 21f851179ff0..27cf7ccda3ec 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1011,6 +1011,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ help ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1041,6 +1057,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig +index 85de313ddec2..4edef95c5f04 100644 +--- a/drivers/cpufreq/Kconfig ++++ b/drivers/cpufreq/Kconfig +@@ -39,6 +39,7 @@ choice + default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1100_CPUFREQ || ARM_SA1110_CPUFREQ + default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM + default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP ++ default CPU_FREQ_DEFAULT_GOV_ONDEMAND if !X86_INTEL_PSTATE + default CPU_FREQ_DEFAULT_GOV_PERFORMANCE + help + This option sets which CPUFreq governor shall be loaded at +diff --git a/fs/proc/base.c b/fs/proc/base.c +index b3422cda2a91..7ab99c9eaa5b 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long 
long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index b2412b4d4c20..0db390aeae9f 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 6e3a5eeec509..6e75344b9f76 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -37,6 +37,10 @@ + #include + #include + ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif ++ + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; + struct backing_dev_info; +@@ -669,8 +673,10 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + struct __call_single_node wake_entry; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ +@@ -696,10 +702,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int 
smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -900,6 +921,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1385,6 +1410,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void 
dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 6d67e9a5af6b..101fe470aa8f 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_remote(struct rq *rq); + void calc_load_nohz_stop(void); +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + 
} + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index c0f71f2e7160..b5a5370d7e3b 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -91,7 +91,7 @@ int kernel_wait(pid_t pid, int *stat); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..f48c5c5da651 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,9 +115,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 
+-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index 29ad68325028..864228366cd2 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -95,6 +95,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ help ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -513,6 +525,7 @@ config SCHED_THERMAL_PRESSURE + default y if ARM64 + depends on SMP + depends on CPU_FREQ_THERMAL ++ depends on !SCHED_MUQSS + help + Select this option to enable thermal pressure accounting in the + scheduler. Thermal pressure is the value conveyed to the scheduler +@@ -862,6 +875,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -946,9 +960,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. 
In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1077,6 +1095,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -1204,6 +1223,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 3711cdaafed2..27826fdd0aa8 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -75,9 +75,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -87,6 +95,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -94,6 +103,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff 
--git a/init/main.c b/init/main.c +index a626e78dbf06..184e927fb450 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1439,6 +1439,8 @@ static int __ref kernel_init(void *unused) + + do_sysctl_args(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..91688dae437b +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,106 @@ ++choice ++ depends on SMP ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. 
If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled at runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. 
++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Makefile b/kernel/Makefile +index aa7368c7eabf..eacb52b1bd94 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o regset.o ++ async.o range.o smpboot.o ucount.o regset.o \ ++ skip_list.o + + obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o + obj-$(CONFIG_MODULES) += kmod.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 04029e35e69a..5ee0dc0b9175 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 1578973c5740..24b3b39f4123 100644 +--- a/kernel/kthread.c 
++++ b/kernel/kthread.c +@@ -471,6 +471,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -491,7 +519,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. 
*/ + to_kthread(p)->cpu = cpu; + return p; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. + */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 5fc9c9b70862..1ff14a21193d 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + 
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..034f2458afd8 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7931 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++#define CREATE_TRACE_POINTS ++#include ++#undef CREATE_TRACE_POINTS ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "MuQSS.h" ++#include "smp.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. 
++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.208 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 1; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 1; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 1; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 1; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 1; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 1; ++ } ++ return 0; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all 
tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. 
++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((&paravirt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later.
++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. 
++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * it's already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. 
++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. 
++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. 
It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return ((u64)prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies.
++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. 
++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++static inline void inc_nr_running(struct rq *rq) ++{ ++ rq->nr_running++; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, 1); ++ } ++} ++ ++static inline void dec_nr_running(struct rq *rq) ++{ ++ rq->nr_running--; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, -1); ++ } ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. 
++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. 
++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ inc_nr_running(rq); ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. 
*/ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether 
any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. 
The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched 
idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++DEFINE_PER_CPU(cpumask_t, idlemask); ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* 
++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. 
*/ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. 
++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. 
++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. 
It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). 
++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (!(p->flags & PF_KTHREAD)) ++ return cpu_active(cpu); ++ ++ /* KTHREAD_IS_PER_CPU is always allowed. */ ++ if (kthread_is_per_cpu(p)) ++ return cpu_online(cpu); ++ ++ /* But are allowed during online. */ ++ return cpu_online(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. ++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int 
__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, ++ u32 flags); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, ++ u32 __always_unused flags) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++#ifdef CONFIG_SMP ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++ else ++#endif ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Consider @p being inside a wait loop: ++ * ++ * for (;;) { ++ * set_current_state(TASK_UNINTERRUPTIBLE); ++ * ++ * if (CONDITION) ++ * break; ++ * ++ * schedule(); ++ * } ++ * __set_current_state(TASK_RUNNING); ++ * ++ * between set_current_state() and schedule(). In this case @p is still ++ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in ++ * an atomic manner. ++ * ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq ++ * then schedule() must still happen and p->state can be changed to ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we ++ * need to do a full wakeup with enqueue. ++ * ++ * Returns: %true when the wakeup is done, ++ * %false otherwise. ++ */ ++static int ttwu_runnable(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. 
++ * Races such that false-negatives are possible, since they ++ * are shorter lived than false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPU's wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_pending() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. ++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * Do not complicate things with the async wake_list while the CPU is ++ * in hotplug state.
++ */ ++ if (!cpu_active(cpu)) ++ return false; ++ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data. ++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. ++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. 
++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++ ++ rq_lock(rq); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. 
++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_runnable()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with smp_store_mb() ++ * in set_current_state() that the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). 
See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about its own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). ++ */ ++ p->state = TASK_WAKING; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, consider queueing p on the remote CPU's wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared.
++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. ++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until it's done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight. 
++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, NULL); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ rq_unlock(rq); ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. 
++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rseq_migrate(p); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++void sched_post_fork(struct task_struct *p) ++{ ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. 
++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. 
++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. ++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(&notifier->link, &current->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier.
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(&notifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ * ++ * See the ttwu() WF_ON_CPU case and its ordering comment. ++ */ ++ WRITE_ONCE(next->on_cpu, 1); ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU.
We ++ * must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. 
++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++static inline void kmap_local_sched_out(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_out(); ++#endif ++} ++ ++static inline void kmap_local_sched_in(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_in(); ++#endif ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ kmap_local_sched_out(); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. 
++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ /* ++ * kmap_local_sched_out() is invoked with rq::lock held and ++ * interrupts disabled. There is no requirement for that, but the ++ * sched out code does not have an interrupt enabled section. ++ * Restoring the maps on sched in does not require interrupts being ++ * disabled either. ++ */ ++ kmap_local_sched_in(); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. 
++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how it's mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. 
++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(void) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. 
++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. ++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = 
kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. 
++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. 
Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ curr = rq->curr; ++ update_rq_clock(rq); ++ ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. */ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. ++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged as not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. 
++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entries lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. 
++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. ++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. 
*/ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ 
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ SCHED_WARN_ON(ct_state() == CONTEXT_USER); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. 
*/ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) 
++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. 
++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that: ++ * ++ * - we form a control dependency vs deactivate_task() below. ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us. ++ */ ++ prev_state = prev->state; ++ if (!preempt && prev_state) { ++ if (signal_pending_state(prev_state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. ++ */ ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. 
++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ inc_nr_running(rq); ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ dec_nr_running(rq); ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ unsigned int task_flags; ++ ++ if (!tsk->state) ++ return; ++ ++ task_flags = tsk->flags; ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (task_flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. 
++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. 
++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. 
++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. 
++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guarantees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. 
++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_rq_unlock(rq, NULL); ++ ++ preempt_enable(); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. 
++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it won't have any effect on scheduling while the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. 
++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. 
++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. ++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = 
security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further, ++ * but store a possible modification of reset_on_fork. ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. 
++ * @param: structure containing the new RT priority. ++ * ++ * Use sched_set_fifo(), read its comment. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++ ++/* ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally ++ * incapable of resource management, which is the one thing an OS really should ++ * be doing. ++ * ++ * This is of course the reason it is limited to privileged users only. ++ * ++ * Worse still; it is fundamentally impossible to compose static priority ++ * workloads. You cannot take two correctly working static prio workloads ++ * and smash them together and still expect them to work. 
++ * ++ * For this reason 'all' FIFO tasks the kernel creates are basically at: ++ * ++ * MAX_RT_PRIO / 2 ++ * ++ * The administrator _MUST_ configure the system, the kernel simply doesn't ++ * know enough information to make a sensible choice. ++ */ ++void sched_set_fifo(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo); ++ ++/* ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. ++ */ ++void sched_set_fifo_low(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = 1 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo_low); ++ ++void sched_set_normal(struct task_struct *p, int nice) ++{ ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ .sched_nice = nice, ++ }; ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_normal); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. 
++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto 
out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++static void do_sched_yield(void) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ preempt_disable(); ++ rq_unlock_irq(rq, &rf); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. If there are no ++ * other threads running on this CPU then this function will return. ++ * ++ * Return: 0. ++ */ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, it's already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. 
++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. ++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. 
++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. 
++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", ++ free, task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ print_stop_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 __always_unused flags) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. 
++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++ ++void migrate_disable(void) ++{ ++} ++EXPORT_SYMBOL_GPL(migrate_disable); ++ ++void migrate_enable(void) ++{ ++} ++EXPORT_SYMBOL_GPL(migrate_enable); ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->flags |= PF_IDLE; ++ ++ scs_task_reset(idle); ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu), 0); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id(), default_cpu = -1;
	struct sched_domain *sd;

	/*
	 * A busy housekeeping CPU keeps its own timers; an idle one is only
	 * remembered as the fallback if no busy CPU is found below.
	 */
	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	rcu_read_lock();
	/* Walk this CPU's domains and take the first non-idle timer housekeeping CPU. */
	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd),
			housekeeping_cpumask(HK_FLAG_TIMER)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	/* Nothing busy was found: fall back, picking any housekeeping CPU if needed. */
	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
	cpu = default_cpu;
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
void wake_up_idle_cpu(int cpu)
{
	/* Nothing to do for the CPU we are already running on. */
	if (cpu == smp_processor_id())
		return;

	/*
	 * An IPI is sent only when set_nr_and_not_polling() returns true;
	 * otherwise just the "woken without IPI" tracepoint is emitted.
	 */
	if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
		smp_sched_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

/*
 * Returns true when @cpu needs no further wakeup from the caller: it is
 * either offline or a nohz_full CPU (which is kicked here when required).
 * Returns false when the caller should fall back to wake_up_idle_cpu().
 */
static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}
#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 *
 * Returns 0 on success (including when @new_mask equals the current mask)
 * and -EINVAL when SCA_CHECK is requested for a PF_NO_SETAFFINITY task or
 * when @new_mask contains no usable CPU.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask,
				  u32 flags)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	bool queued = false, running_wrong = false, kthread;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	kthread = !!(p->flags & PF_KTHREAD);
	if (kthread) {
		/*
		 * Kernel threads are allowed on online && !active CPUs
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	/* Unchanged mask: nothing to do, report success. */
	if (cpumask_equal(&p->cpus_mask, new_mask))
		goto out;
	/*
	 * Picking a ~random cpu helps in cases where we are changing affinity
	 * for groups of tasks (ie. cpuset), so that load balancing is not
	 * immediately required to distribute the tasks within their new mask.
	 */
	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
	if (dest_cpu >= nr_cpu_ids) {
		ret = -EINVAL;
		goto out;
	}

	/* Sample the queued state before the mask is switched over. */
	queued = task_queued(p);
	__do_set_cpus_allowed(p, new_mask);

	if (kthread) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	if (task_running(rq, p)) {
		/* Task is running on the wrong cpu now, reschedule it. */
		if (rq == this_rq()) {
			/*
			 * The task is running on this very runqueue: retarget
			 * it and remember to call __schedule() once the locks
			 * have been dropped (see the tail of this function).
			 */
			set_task_cpu(p, dest_cpu);
			set_tsk_need_resched(p);
			running_wrong = true;
		} else
			resched_task(p);
	} else {
		if (queued) {
			/*
			 * Switch runqueue locks after dequeueing the task
			 * here while still holding the pi_lock to be holding
			 * the correct lock for enqueueing.
			 */
			dequeue_task(rq, p, 0);
			rq_unlock(rq);

			/* From here on @rq is the destination runqueue. */
			rq = cpu_rq(dest_cpu);
			rq_lock(rq);
		}
		set_task_cpu(p, dest_cpu);
		if (queued)
			enqueue_task(rq, p, 0);
	}
	if (queued)
		try_preempt(p, rq);
	/* Taken before the unlock below; released after __schedule(). */
	if (running_wrong)
		preempt_disable();
out:
	/* Releases whichever runqueue @rq refers to at this point. */
	task_rq_unlock(rq, p, &rf);

	if (running_wrong) {
		__schedule(true);
		preempt_enable();
	}

	return ret;
}

/*
 * Exported affinity setter: forwards to __set_cpus_allowed_ptr() with no
 * flags, so the SCA_CHECK test against PF_NO_SETAFFINITY is not applied.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Run through task list and find tasks affined to the dead cpu, then remove
 * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
 * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and
 * pi_lock to change cpus_mask but it's not going to matter here.
++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. 
++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. 
++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", 
&sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ 
++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. 
++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. 
++ */ ++ synchronize_rcu(); ++ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ update_rq_clock(rq); ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ rq_unlock_irqrestore(rq, &rf); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_wait_empty(unsigned int __always_unused cpu) ++{ ++ return 0; ++} ++ ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++#ifdef CONFIG_X86 ++ return per_cpu(cpu_llc_shared_map, cpu); ++#else ++ return topology_core_cpumask(cpu); ++#endif ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++/* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. 
++ */ ++static void __init select_leaders(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ ++ for (cpu = 0; cpu < num_online_cpus(); cpu++) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smp_leader) ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->mc_leader) ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smt_leader) ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++} ++ ++/* FIXME freeing locked spinlock */ ++static void __init 
share_and_free_rq(struct rq *leader, struct rq *rq) ++{ ++ WARN_ON(rq->nr_running > 0); ++ ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ rq->is_leader = false; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++} ++ ++static void __init share_rqs(void) ++{ ++ struct rq *rq, *leader; ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++} ++ ++static void __init setup_rq_orders(void) ++{ ++ int *selected_cpus, *ordered_cpus; ++ struct rq *rq, *other_rq; ++ int cpu, other_cpu, i; ++ ++ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if (rq->is_leader) ++ total_runqueues++; ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu 
orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if (other_rq->is_leader) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ kfree(selected_cpus); ++ kfree(ordered_cpus); ++ ++#ifdef CONFIG_X86 ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++#endif ++} ++ ++void __init sched_init_smp(void) ++{ ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. 
++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ select_leaders(); ++ ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ share_rqs(); ++ ++ local_irq_enable(); ++ ++ setup_rq_orders(); ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ 
struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->is_leader = true; ++ rq->smp_leader = NULL; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = NULL; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = NULL; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ 
rq_attach_root(rq, &def_root_domain); ++#endif /* CONFIG_SMP */ ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(*rq->cpu_locality), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful 
for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPUs synchronised and interrupts disabled, and the ++ * caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void call_trace_sched_update_nr_running(struct rq *rq, int count) ++{ ++ trace_sched_update_nr_running_tp(rq, count); ++} ++ ++/* CFS Compat */ ++#ifdef CONFIG_RCU_TORTURE_TEST ++int sysctl_sched_rt_runtime; ++#endif +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..0ffeddf5ddd3 +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1082 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include 
++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#include ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* Wake flags. The first three directly map to some SD flag value */ ++#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ ++#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ ++#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ ++ ++#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ ++#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ ++#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ ++ ++#ifdef CONFIG_SMP ++static_assert(WF_EXEC == SD_BALANCE_EXEC); ++static_assert(WF_FORK == SD_BALANCE_FORK); ++static_assert(WF_TTWU == SD_BALANCE_WAKE); ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++extern void call_trace_sched_update_nr_running(struct rq *rq, int count); ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. 
++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ u64 visit_gen; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle; ++ struct task_struct *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++#ifdef CONFIG_SMP ++ unsigned int ttwu_pending; ++#endif ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct 
root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ bool is_leader; ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ 
++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ 
break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned 
int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#define SCA_CHECK 0x01 ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. 
(Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); ++ ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void flush_smp_call_function_from_idle(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ 
++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime ++ * subtracted from it and would never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_rq_util_with(struct rq 
__maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline bool uclamp_is_used(void) ++{ ++ return false; ++} ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++/** ++ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. ++ * @cpu: the CPU in question. ++ * ++ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. ++ * ++ * f_curr ++ * ------ * SCHED_CAPACITY_SCALE ++ * f_max ++ */ ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. 
++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. ++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ ++static inline int ++update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) ++{ ++ return 0; ++} ++ ++static inline u64 thermal_load_avg(struct rq *rq) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_RCU_TORTURE_TEST ++extern int sysctl_sched_rt_runtime; ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 6931f0cdeb80..5b1e9dce4d6a 100644 +--- 
a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -171,6 +171,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -199,7 +205,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -681,7 +687,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index d6cba0020064..935c7dc48e26 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,6 +17,7 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask); +@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5f611658eeab..743b462b04b9 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -267,26 +267,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT 
+-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -612,7 +592,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 305727ea0677..1adb4aa3c828 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -396,6 +396,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. 
+ */ +@@ -509,3 +510,4 @@ DEFINE_SCHED_CLASS(idle) = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index bb09988451a0..2d25f046e5f5 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2707,3 +2720,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* MuQSS compatibility functions */ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 5d3675c7a76b..551c3ee1fb5c 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -467,7 +467,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -493,7 +497,11 @@ void rq_attach_root(struct rq 
*rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. 
++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 
0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index c9fbdd848138..c85a2efde05e 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -120,6 +120,14 @@ static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int two_hundred = 200; + static int one_thousand = 1000; ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -184,7 +192,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; + int sysctl_legacy_va_layout; + #endif + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -193,7 +201,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ 
+-#endif /* CONFIG_SCHED_DEBUG */ ++#endif /* CONFIG_SCHED_DEBUG && !CONFIG_SCHED_MUQSS */ + + #ifdef CONFIG_COMPACTION + static int min_extfrag_threshold; +@@ -1652,6 +1660,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1843,6 +1852,56 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ONE, + }, + #endif ++#elif defined(CONFIG_SCHED_MUQSS) ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS) ++ { ++ .procname = "sched_schedstats", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sysctl_schedstats, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */ ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index 83e158d016ba..99235d57431e 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -132,7 +132,7 @@ config CONTEXT_TRACKING + + config 
CONTEXT_TRACKING_FORCE + bool "Force context tracking" +- depends on CONTEXT_TRACKING ++ depends on CONTEXT_TRACKING && !SCHED_MUQSS + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..7a61971cca74 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,13 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + ++#ifdef CONFIG_SCHED_MUQSS ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) ++#else + /* Limit min_delta to a jiffie */ + #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++#endif + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index a71758e34e45..ebb84a65d928 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 8dbc008f8942..de93158493fd 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1591,7 +1591,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1609,6 +1609,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1678,7 +1681,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 73ef12092250..f4d06357e783 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1052,10 +1052,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +-- +2.25.1 + diff --git a/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4502_futex-wait-multiple.patch b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4502_futex-wait-multiple.patch new file mode 100644 
index 000000000..e06d64734 --- /dev/null +++ b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.11/4502_futex-wait-multiple.patch @@ -0,0 +1,502 @@ +# Calculate format=diff merge(sys-kernel/calculate-sources[fsync])!= +From df2930f79f2203bc308fa3f4ebbbe913925c9531 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Fri, 29 Jan 2021 13:03:58 +0100 +Subject: [PATCH] futex: resync from gitlab.collabora.com + +Signed-off-by: Piotr Gorski +--- + include/uapi/linux/futex.h | 20 +++ + kernel/futex.c | 357 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 373 insertions(+), 4 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0acc..a3e760886 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -40,6 +41,8 @@ + FUTEX_PRIVATE_FLAG) + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +@@ -150,4 +153,21 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++/* ++ * Maximum number of multiple futexes to wait for ++ */ ++#define FUTEX_MULTIPLE_MAX_COUNT 128 ++ ++/** ++ * struct futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 45a13eb88..6acf2c806 100644 +--- a/kernel/futex.c 
++++ b/kernel/futex.c +@@ -198,6 +198,8 @@ struct futex_pi_state { + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup ++ * @uaddr: userspace address of futex ++ * @uval: expected futex's value + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). +@@ -220,6 +222,8 @@ struct futex_q { + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; ++ u32 __user *uaddr; ++ u32 uval; + } __randomize_layout; + + static const struct futex_q futex_q_init = { +@@ -2313,6 +2317,29 @@ static int unqueue_me(struct futex_q *q) + return ret; + } + ++/** ++ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket ++ * @q: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - If no futex was awoken ++ */ ++static int unqueue_multiple(struct futex_q *q, int count) ++{ ++ int ret = -1; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&q[i])) ++ ret = i; ++ } ++ return ret; ++} ++ + /* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry +@@ -2680,6 +2707,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++/** ++ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes ++ * @qs: The corresponding futex list ++ * @count: The size of the lists ++ * @flags: Futex flags (FLAGS_SHARED, etc.) ++ * @awaken: Index of the last awoken futex ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. 
On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_q *qs, int count, ++ unsigned int flags, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to ++ * enqueue each futex in the list before dealing with the next ++ * one to avoid deadlocking on the hash bucket. But, before ++ * enqueuing, we need to make sure that current->state is ++ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which ++ * cannot be done before the get_futex_key of the next key, ++ * because it calls get_user_pages, which can sleep. Thus, we ++ * fetch the list of futexes keys in two steps, by first pinning ++ * all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret)) { ++ return ret; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ struct futex_q *q = &qs[i]; ++ ++ hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, q->uaddr); ++ if (ret) { ++ /* ++ * We need to try to handle the fault, which ++ * cannot be done without sleep, so we need to ++ * undo all the work already done, to make sure ++ * we don't miss any wake ups. Therefore, clean ++ * up, handle the fault and retry from the ++ * beginning. ++ */ ++ queue_unlock(hb); ++ ++ /* ++ * Keys 0..(i-1) are implicitly put ++ * on unqueue_multiple. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * On a real fault, prioritize the error even if ++ * some other futex was awoken. 
Userspace gave ++ * us a bad address, -EFAULT them. ++ */ ++ ret = get_user(uval, q->uaddr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Even if the page fault was handled, If ++ * something was already awaken, we can safely ++ * give up and succeed to give a hint for userspace to ++ * acquire the right futex faster. ++ */ ++ if (*awaken >= 0) ++ return 1; ++ ++ goto retry; ++ } ++ ++ if (uval != q->uval) { ++ queue_unlock(hb); ++ ++ /* ++ * If something was already awaken, we can ++ * safely ignore the error and succeed. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ __set_current_state(TASK_RUNNING); ++ if (*awaken >= 0) ++ return 1; ++ ++ return -EWOULDBLOCK; ++ } ++ ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ return 0; ++} ++ ++/** ++ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes ++ * @qs: The list of futexes to wait on ++ * @op: Operation code from futex's syscall ++ * @count: The number of objects ++ * @abs_time: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that ++ * triggered, or after the timeout has elapsed. 
++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++static int futex_wait_multiple(struct futex_q *qs, int op, ++ u32 count, ktime_t *abs_time) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ int ret, flags = 0, hint = 0; ++ unsigned int i; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) ++ flags |= FLAGS_CLOCKRT; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, 0); ++ while (1) { ++ ret = futex_wait_multiple_setup(qs, count, flags, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ break; ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* ++ * Avoid sleeping if another thread already tried to ++ * wake us. ++ */ ++ for (i = 0; i < count; i++) { ++ if (plist_node_empty(&qs[i].list)) ++ break; ++ } ++ ++ if (i == count && (!to || to->task)) ++ freezable_schedule(); ++ ++ ret = unqueue_multiple(qs, count); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (ret >= 0) ++ break; ++ if (to && !to->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } else if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. 
++ */ ++ } ++ ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -3761,6 +3987,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + return -ENOSYS; + } + ++/** ++ * futex_read_wait_block - Read an array of futex_wait_block from userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function creates and allocate an array of futex_q (we zero it to ++ * initialize the fields) and then, for each futex_wait_block element from ++ * userspace, fill a futex_q element with proper values. ++ */ ++inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct futex_wait_block fwb; ++ struct futex_wait_block __user *entry = ++ (struct futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = fwb.uaddr; ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} + + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, +@@ -3773,7 +4036,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3782,7 +4046,7 @@ SYSCALL_DEFINE6(futex, u32 __user 
*, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); +@@ -3796,6 +4060,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs; ++ ++#ifdef CONFIG_X86_X32 ++ if (unlikely(in_x32_syscall())) ++ return -ENOSYS; ++#endif ++ qs = futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + +@@ -3958,6 +4241,58 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + #endif /* CONFIG_COMPAT */ + + #ifdef CONFIG_COMPAT_32BIT_TIME ++/** ++ * struct compat_futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex (compatible pointer) ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct compat_futex_wait_block { ++ compat_uptr_t uaddr; ++ __u32 pad; ++ __u32 val; ++ __u32 bitset; ++}; ++ ++/** ++ * compat_futex_read_wait_block - Read an array of futex_wait_block from ++ * userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function does the same as futex_read_wait_block(), except that it ++ * converts the pointer to the futex from the compat version to the regular one. 
++ */ ++inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, ++ u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct compat_futex_wait_block fwb; ++ struct compat_futex_wait_block __user *entry = ++ (struct compat_futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = compat_ptr(fwb.uaddr); ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +@@ -3969,14 +4304,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + else if (!(op & FUTEX_CLOCK_REALTIME)) + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); +@@ -3986,6 +4322,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, 
val3); + } + #endif /* CONFIG_COMPAT_32BIT_TIME */ +-- +2.30.1.457.gf011795891 + diff --git a/sys-kernel/calculate-sources/calculate-sources-5.10.18.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.10.18.ebuild index 55411e221..1210fed76 100644 --- a/sys-kernel/calculate-sources/calculate-sources-5.10.18.ebuild +++ b/sys-kernel/calculate-sources/calculate-sources-5.10.18.ebuild @@ -13,7 +13,7 @@ HOMEPAGE="http://www.calculate-linux.org" SRC_URI="${KERNEL_URI} ${ARCH_URI}" -IUSE="fsync uksm" +IUSE="fsync muqss uksm" src_unpack() { calculate-kernel-8_src_unpack diff --git a/sys-kernel/calculate-sources/calculate-sources-5.11.1.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.11.1.ebuild index d42ec5e2f..1210fed76 100644 --- a/sys-kernel/calculate-sources/calculate-sources-5.11.1.ebuild +++ b/sys-kernel/calculate-sources/calculate-sources-5.11.1.ebuild @@ -13,7 +13,7 @@ HOMEPAGE="http://www.calculate-linux.org" SRC_URI="${KERNEL_URI} ${ARCH_URI}" -IUSE="" +IUSE="fsync muqss uksm" src_unpack() { calculate-kernel-8_src_unpack