From 9b117a69d6f696a38eece7478f56c082bbbc1f5e Mon Sep 17 00:00:00 2001
From: Alexander Tratsevskiy
Date: Wed, 3 Nov 2021 14:40:22 +0300
Subject: [PATCH] sys-kernel/calculate-sources: Version bump to 5.14.16, 5.10.77, 5.4.157; add USE fsync for 5.15.0

---
 .../5.15/4502_futex-wait-multiple.patch       | 9811 +++++++++++++++++
 sys-kernel/calculate-sources/Manifest         |    6 +-
 ...build => calculate-sources-5.10.77.ebuild} |    0
 ...build => calculate-sources-5.14.16.ebuild} |    0
 .../calculate-sources-5.15.0.ebuild           |    2 +-
 ...build => calculate-sources-5.4.157.ebuild} |    0
 6 files changed, 9815 insertions(+), 4 deletions(-)
 create mode 100644 profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.15/4502_futex-wait-multiple.patch
 rename sys-kernel/calculate-sources/{calculate-sources-5.10.76.ebuild => calculate-sources-5.10.77.ebuild} (100%)
 rename sys-kernel/calculate-sources/{calculate-sources-5.14.15.ebuild => calculate-sources-5.14.16.ebuild} (100%)
 rename sys-kernel/calculate-sources/{calculate-sources-5.4.156.ebuild => calculate-sources-5.4.157.ebuild} (100%)
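
Note on the new 5.15 template: the bundled 4502_futex-wait-multiple.patch resyncs the futex2 futex_waitv() work from Collabora, and its "# Calculate format=diff merge(sys-kernel/calculate-sources[fsync])!=" header means it is only applied when sys-kernel/calculate-sources is built with USE="fsync" (for example a "sys-kernel/calculate-sources fsync" entry in package.use), the flag Wine/Proton fsync setups typically enable. The sketch below is purely illustrative and not part of this commit: the syscall number 449, the struct futex_waitv layout and the FUTEX_32 value are taken from the hunks further down, while the program itself (local fallback defines, one-second timeout, no waker thread) is a hypothetical minimal test. glibc ships no wrapper for futex_waitv(), so the call goes through syscall(2).

/*
 * Illustrative sketch only, not part of this commit: wait on a single
 * 32-bit futex word via the futex_waitv() syscall added by the bundled
 * patch. Syscall number, struct layout and FUTEX_32 come from the patch
 * hunks below; they are defined locally so the example builds with
 * pre-5.16 userspace headers.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_futex_waitv
#define __NR_futex_waitv 449
#endif

#define FUTEX_32 2              /* only futex size supported so far */

struct futex_waitv {
        uint64_t val;           /* expected value at uaddr */
        uint64_t uaddr;         /* user address to wait on */
        uint32_t flags;         /* futex size, optionally FUTEX_PRIVATE_FLAG */
        uint32_t __reserved;    /* must be 0 */
};

static uint32_t futex_word;     /* a waker would normally change this and wake us */

int main(void)
{
        struct futex_waitv waiter = {
                .val        = 0,                        /* matches futex_word, so we sleep */
                .uaddr      = (uintptr_t)&futex_word,   /* uintptr_t keeps upper bits zeroed */
                .flags      = FUTEX_32,
                .__reserved = 0,
        };
        struct timespec timeout;

        /* futex_waitv() takes an absolute timeout on the clock given by clockid. */
        clock_gettime(CLOCK_MONOTONIC, &timeout);
        timeout.tv_sec += 1;

        long ret = syscall(__NR_futex_waitv, &waiter, 1 /* nr_futexes */,
                           0 /* flags */, &timeout, CLOCK_MONOTONIC);
        if (ret >= 0)
                printf("woken, futex index %ld\n", ret);
        else
                perror("futex_waitv");
        return 0;
}

With nothing writing the futex word, the call sleeps until the absolute CLOCK_MONOTONIC deadline and fails with ETIMEDOUT on a patched kernel, while an unpatched 5.15 kernel returns ENOSYS, which makes this a quick way to check that the template was actually applied.
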
diff --git a/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.15/4502_futex-wait-multiple.patch b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.15/4502_futex-wait-multiple.patch
new file mode 100644
index 000000000..57d514321
--- /dev/null
+++ b/profiles/templates/3.6/6_ac_install_patch/sys-kernel/calculate-sources/5.15/4502_futex-wait-multiple.patch
@@ -0,0 +1,9811 @@
+# Calculate format=diff merge(sys-kernel/calculate-sources[fsync])!=
+From 4dc2913212c08c6970f6e8971fd23b6328982f94 Mon Sep 17 00:00:00 2001
+From: Piotr Gorski
+Date: Mon, 1 Nov 2021 12:11:04 +0100
+Subject: [PATCH] futex: resync from gitlab.collabora.com
+
+Signed-off-by: Piotr Gorski
+---
+ Documentation/userspace-api/futex2.rst        |   86 +
+ Documentation/userspace-api/index.rst         |    1 +
+ MAINTAINERS                                   |    3 +-
+ arch/arm/tools/syscall.tbl                    |    1 +
+ arch/arm64/include/asm/unistd.h               |    2 +-
+ arch/arm64/include/asm/unistd32.h             |    2 +
+ arch/x86/entry/syscalls/syscall_32.tbl        |    1 +
+ arch/x86/entry/syscalls/syscall_64.tbl        |    1 +
+ include/linux/syscalls.h                      |    7 +-
+ include/uapi/asm-generic/unistd.h             |    5 +-
+ include/uapi/linux/futex.h                    |   25 +
+ kernel/Makefile                               |    2 +-
+ kernel/futex.c                                | 4272 -----------------
+ kernel/futex/Makefile                         |    3 +
+ kernel/futex/core.c                           | 1176 +++++
+ kernel/futex/futex.h                          |  295 ++
+ kernel/futex/pi.c                             | 1233 +++++
+ kernel/futex/requeue.c                        |  897 ++++
+ kernel/futex/syscalls.c                       |  396 ++
+ kernel/futex/waitwake.c                       |  708 +++
+ kernel/sys_ni.c                               |    3 +-
+ .../selftests/futex/functional/.gitignore     |    1 +
+ .../selftests/futex/functional/Makefile       |    3 +-
+ .../futex/functional/futex_wait_timeout.c     |   21 +-
+ .../futex/functional/futex_wait_wouldblock.c  |   41 +-
+ .../selftests/futex/functional/futex_waitv.c  |  237 +
+ .../testing/selftests/futex/functional/run.sh |    3 +
+ .../selftests/futex/include/futex2test.h      |   22 +
+ 28 files changed, 5163 insertions(+), 4284 deletions(-)
+ create mode 100644 Documentation/userspace-api/futex2.rst
+ delete mode 100644 kernel/futex.c
+ create mode 100644 kernel/futex/Makefile
+ create mode 100644 kernel/futex/core.c
+ create mode 100644 kernel/futex/futex.h
+ create mode 100644 kernel/futex/pi.c
+ create mode 100644 kernel/futex/requeue.c
+ create mode 100644 kernel/futex/syscalls.c
+ create mode 100644 kernel/futex/waitwake.c
+ create mode 100644 tools/testing/selftests/futex/functional/futex_waitv.c
+ create mode 100644 tools/testing/selftests/futex/include/futex2test.h
+
+diff --git a/Documentation/userspace-api/futex2.rst b/Documentation/userspace-api/futex2.rst
+new file mode 100644
+index 000000000..7d37409df
+--- /dev/null
++++ b/Documentation/userspace-api/futex2.rst
+@@ -0,0 +1,86 @@
++.. SPDX-License-Identifier: GPL-2.0
++
++======
++futex2
++======
++
++:Author: André Almeida
++
++futex, or fast user mutex, is a set of syscalls to allow userspace to create
++performant synchronization mechanisms, such as mutexes, semaphores and
++condition variables in userspace. C standard libraries, like glibc, use it
++as a means to implement higher-level interfaces like pthreads.
++
++futex2 is a follow-up version of the initial futex syscall, designed to overcome
++limitations of the original interface.
++
++User API
++========
++
++``futex_waitv()``
++-----------------
++
++Wait on an array of futexes, wake on any::
++
++  futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
++              unsigned int flags, struct timespec *timeout, clockid_t clockid)
++
++  struct futex_waitv {
++        __u64 val;
++        __u64 uaddr;
++        __u32 flags;
++        __u32 __reserved;
++  };
++
++Userspace sets an array of struct futex_waitv (up to a max of 128 entries),
++using ``uaddr`` for the address to wait for, ``val`` for the expected value
++and ``flags`` to specify the type (e.g. private) and size of futex.
++``__reserved`` needs to be 0, but it can be used for future extension. The
++pointer for the first item of the array is passed as ``waiters``. An invalid
++address for ``waiters`` or for any ``uaddr`` returns ``-EFAULT``.
++
++If userspace has 32-bit pointers, it should do an explicit cast to make sure
++the upper bits are zeroed. ``uintptr_t`` does the trick and it works for
++both 32/64-bit pointers.
++
++``nr_futexes`` specifies the size of the array. Numbers outside the [1, 128]
++interval will make the syscall return ``-EINVAL``.
++
++The ``flags`` argument of the syscall needs to be 0, but it can be used for
++future extension.
++
++For each entry in the ``waiters`` array, the current value at ``uaddr`` is compared
++to ``val``. If it's different, the syscall undoes all the work done so far and
++returns ``-EAGAIN``. If all tests and verifications succeed, the syscall waits until
++one of the following happens:
++
++- The timeout expires, returning ``-ETIMEDOUT``.
++- A signal is sent to the sleeping task, returning ``-ERESTARTSYS``.
++- Some futex in the list is woken, returning the index of the woken futex.
++
++An example of how to use the interface can be found at ``tools/testing/selftests/futex/functional/futex_waitv.c``.
++
++Timeout
++-------
++
++The ``struct timespec *timeout`` argument is an optional argument that points to an
++absolute timeout. You need to specify the type of clock being used in the
++``clockid`` argument. ``CLOCK_MONOTONIC`` and ``CLOCK_REALTIME`` are supported.
++This syscall accepts only 64-bit timespec structs.
++
++Types of futex
++--------------
++
++A futex can be either private or shared.
Private is used for processes that ++shares the same memory space and the virtual address of the futex will be the ++same for all processes. This allows for optimizations in the kernel. To use ++private futexes, it's necessary to specify ``FUTEX_PRIVATE_FLAG`` in the futex ++flag. For processes that doesn't share the same memory space and therefore can ++have different virtual addresses for the same futex (using, for instance, a ++file-backed shared memory) requires different internal mechanisms to be get ++properly enqueued. This is the default behavior, and it works with both private ++and shared futexes. ++ ++Futexes can be of different sizes: 8, 16, 32 or 64 bits. Currently, the only ++supported one is 32 bit sized futex, and it need to be specified using ++``FUTEX_32`` flag. +diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst +index c432be070..a61eac0c7 100644 +--- a/Documentation/userspace-api/index.rst ++++ b/Documentation/userspace-api/index.rst +@@ -28,6 +28,7 @@ place where this information is gathered. + media/index + sysfs-platform_profile + vduse ++ futex2 + + .. only:: subproject and html + +diff --git a/MAINTAINERS b/MAINTAINERS +index 3b79fd441..dd165835f 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7737,6 +7737,7 @@ M: Ingo Molnar + R: Peter Zijlstra + R: Darren Hart + R: Davidlohr Bueso ++R: André Almeida + L: linux-kernel@vger.kernel.org + S: Maintained + T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core +@@ -7744,7 +7745,7 @@ F: Documentation/locking/*futex* + F: include/asm-generic/futex.h + F: include/linux/futex.h + F: include/uapi/linux/futex.h +-F: kernel/futex.c ++F: kernel/futex/* + F: tools/perf/bench/futex* + F: tools/testing/selftests/futex/ + +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index e842209e1..543100151 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -462,3 +462,4 @@ + 446 common landlock_restrict_self sys_landlock_restrict_self + # 447 reserved for memfd_secret + 448 common process_mrelease sys_process_mrelease ++449 common futex_waitv sys_futex_waitv +diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h +index 3cb206aea..6bdb5f5db 100644 +--- a/arch/arm64/include/asm/unistd.h ++++ b/arch/arm64/include/asm/unistd.h +@@ -38,7 +38,7 @@ + #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) + #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) + +-#define __NR_compat_syscalls 449 ++#define __NR_compat_syscalls 450 + #endif + + #define __ARCH_WANT_SYS_CLONE +diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h +index 844f6ae58..41ea1195e 100644 +--- a/arch/arm64/include/asm/unistd32.h ++++ b/arch/arm64/include/asm/unistd32.h +@@ -903,6 +903,8 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) + __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) + #define __NR_process_mrelease 448 + __SYSCALL(__NR_process_mrelease, sys_process_mrelease) ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) + + /* + * Please add new compat syscalls above this comment and update +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 960a021d5..7e2554369 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -453,3 +453,4 @@ + 446 i386 landlock_restrict_self sys_landlock_restrict_self + 447 i386 memfd_secret sys_memfd_secret + 448 i386 
process_mrelease sys_process_mrelease ++449 i386 futex_waitv sys_futex_waitv +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 18b5500ea..fe8f8dd15 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -370,6 +370,7 @@ + 446 common landlock_restrict_self sys_landlock_restrict_self + 447 common memfd_secret sys_memfd_secret + 448 common process_mrelease sys_process_mrelease ++449 common futex_waitv sys_futex_waitv + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 252243c77..528a478db 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -58,6 +58,7 @@ struct mq_attr; + struct compat_stat; + struct old_timeval32; + struct robust_list_head; ++struct futex_waitv; + struct getcpu_cache; + struct old_linux_dirent; + struct perf_event_attr; +@@ -610,7 +611,7 @@ asmlinkage long sys_waitid(int which, pid_t pid, + asmlinkage long sys_set_tid_address(int __user *tidptr); + asmlinkage long sys_unshare(unsigned long unshare_flags); + +-/* kernel/futex.c */ ++/* kernel/futex/syscalls.c */ + asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, + const struct __kernel_timespec __user *utime, + u32 __user *uaddr2, u32 val3); +@@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid, + asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, + size_t len); + ++asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, ++ unsigned int nr_futexes, unsigned int flags, ++ struct __kernel_timespec __user *timeout, clockid_t clockid); ++ + /* kernel/hrtimer.c */ + asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, + struct __kernel_timespec __user *rmtp); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 1c5fb86d4..4557a8b60 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -880,8 +880,11 @@ __SYSCALL(__NR_memfd_secret, sys_memfd_secret) + #define __NR_process_mrelease 448 + __SYSCALL(__NR_process_mrelease, sys_process_mrelease) + ++#define __NR_futex_waitv 449 ++__SYSCALL(__NR_futex_waitv, sys_futex_waitv) ++ + #undef __NR_syscalls +-#define __NR_syscalls 449 ++#define __NR_syscalls 450 + + /* + * 32 bit systems traditionally used different +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 235e5b2fa..71a5df8d2 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -43,6 +43,31 @@ + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) + ++/* ++ * Flags to specify the bit length of the futex word for futex2 syscalls. ++ * Currently, only 32 is supported. ++ */ ++#define FUTEX_32 2 ++ ++/* ++ * Max numbers of elements in a futex_waitv array ++ */ ++#define FUTEX_WAITV_MAX 128 ++ ++/** ++ * struct futex_waitv - A waiter for vectorized wait ++ * @val: Expected value at uaddr ++ * @uaddr: User address to wait on ++ * @flags: Flags for this waiter ++ * @__reserved: Reserved member to preserve data alignment. Should be 0. ++ */ ++struct futex_waitv { ++ __u64 val; ++ __u64 uaddr; ++ __u32 flags; ++ __u32 __reserved; ++}; ++ + /* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. 
+diff --git a/kernel/Makefile b/kernel/Makefile +index 4df609be4..3f6ab5d50 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o + obj-$(CONFIG_PROFILING) += profile.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ +-obj-$(CONFIG_FUTEX) += futex.o ++obj-$(CONFIG_FUTEX) += futex/ + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o + obj-$(CONFIG_SMP) += smp.o + ifneq ($(CONFIG_SMP),y) +diff --git a/kernel/futex.c b/kernel/futex.c +deleted file mode 100644 +index c15ad276f..000000000 +--- a/kernel/futex.c ++++ /dev/null +@@ -1,4272 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * Fast Userspace Mutexes (which I call "Futexes!"). +- * (C) Rusty Russell, IBM 2002 +- * +- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar +- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved +- * +- * Removed page pinning, fix privately mapped COW pages and other cleanups +- * (C) Copyright 2003, 2004 Jamie Lokier +- * +- * Robust futex support started by Ingo Molnar +- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved +- * Thanks to Thomas Gleixner for suggestions, analysis and fixes. +- * +- * PI-futex support started by Ingo Molnar and Thomas Gleixner +- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar +- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner +- * +- * PRIVATE futexes by Eric Dumazet +- * Copyright (C) 2007 Eric Dumazet +- * +- * Requeue-PI support by Darren Hart +- * Copyright (C) IBM Corporation, 2009 +- * Thanks to Thomas Gleixner for conceptual design and careful reviews. +- * +- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly +- * enough at me, Linus for the original (flawed) idea, Matthew +- * Kirkwood for proof-of-concept implementation. +- * +- * "The futexes are also cursed." +- * "But they come in a choice of three flavours!" +- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +- +-#include "locking/rtmutex_common.h" +- +-/* +- * READ this before attempting to hack on futexes! +- * +- * Basic futex operation and ordering guarantees +- * ============================================= +- * +- * The waiter reads the futex value in user space and calls +- * futex_wait(). This function computes the hash bucket and acquires +- * the hash bucket lock. After that it reads the futex user space value +- * again and verifies that the data has not changed. If it has not changed +- * it enqueues itself into the hash bucket, releases the hash bucket lock +- * and schedules. +- * +- * The waker side modifies the user space value of the futex and calls +- * futex_wake(). This function computes the hash bucket and acquires the +- * hash bucket lock. Then it looks for waiters on that futex in the hash +- * bucket and wakes them. +- * +- * In futex wake up scenarios where no tasks are blocked on a futex, taking +- * the hb spinlock can be avoided and simply return. 
In order for this +- * optimization to work, ordering guarantees must exist so that the waiter +- * being added to the list is acknowledged when the list is concurrently being +- * checked by the waker, avoiding scenarios like the following: +- * +- * CPU 0 CPU 1 +- * val = *futex; +- * sys_futex(WAIT, futex, val); +- * futex_wait(futex, val); +- * uval = *futex; +- * *futex = newval; +- * sys_futex(WAKE, futex); +- * futex_wake(futex); +- * if (queue_empty()) +- * return; +- * if (uval == val) +- * lock(hash_bucket(futex)); +- * queue(); +- * unlock(hash_bucket(futex)); +- * schedule(); +- * +- * This would cause the waiter on CPU 0 to wait forever because it +- * missed the transition of the user space value from val to newval +- * and the waker did not find the waiter in the hash bucket queue. +- * +- * The correct serialization ensures that a waiter either observes +- * the changed user space value before blocking or is woken by a +- * concurrent waker: +- * +- * CPU 0 CPU 1 +- * val = *futex; +- * sys_futex(WAIT, futex, val); +- * futex_wait(futex, val); +- * +- * waiters++; (a) +- * smp_mb(); (A) <-- paired with -. +- * | +- * lock(hash_bucket(futex)); | +- * | +- * uval = *futex; | +- * | *futex = newval; +- * | sys_futex(WAKE, futex); +- * | futex_wake(futex); +- * | +- * `--------> smp_mb(); (B) +- * if (uval == val) +- * queue(); +- * unlock(hash_bucket(futex)); +- * schedule(); if (waiters) +- * lock(hash_bucket(futex)); +- * else wake_waiters(futex); +- * waiters--; (b) unlock(hash_bucket(futex)); +- * +- * Where (A) orders the waiters increment and the futex value read through +- * atomic operations (see hb_waiters_inc) and where (B) orders the write +- * to futex and the waiters read (see hb_waiters_pending()). +- * +- * This yields the following case (where X:=waiters, Y:=futex): +- * +- * X = Y = 0 +- * +- * w[X]=1 w[Y]=1 +- * MB MB +- * r[Y]=y r[X]=x +- * +- * Which guarantees that x==0 && y==0 is impossible; which translates back into +- * the guarantee that we cannot both miss the futex variable change and the +- * enqueue. +- * +- * Note that a new waiter is accounted for in (a) even when it is possible that +- * the wait call can return error, in which case we backtrack from it in (b). +- * Refer to the comment in queue_lock(). +- * +- * Similarly, in order to account for waiters being requeued on another +- * address we always increment the waiters for the destination bucket before +- * acquiring the lock. It then decrements them again after releasing it - +- * the code that actually moves the futex(es) between hash buckets (requeue_futex) +- * will do the additional required waiter count housekeeping. This is done for +- * double_lock_hb() and double_unlock_hb(), respectively. +- */ +- +-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +-#define futex_cmpxchg_enabled 1 +-#else +-static int __read_mostly futex_cmpxchg_enabled; +-#endif +- +-/* +- * Futex flags used to encode options to functions and preserve them across +- * restarts. +- */ +-#ifdef CONFIG_MMU +-# define FLAGS_SHARED 0x01 +-#else +-/* +- * NOMMU does not have per process address space. Let the compiler optimize +- * code away. 
+- */ +-# define FLAGS_SHARED 0x00 +-#endif +-#define FLAGS_CLOCKRT 0x02 +-#define FLAGS_HAS_TIMEOUT 0x04 +- +-/* +- * Priority Inheritance state: +- */ +-struct futex_pi_state { +- /* +- * list of 'owned' pi_state instances - these have to be +- * cleaned up in do_exit() if the task exits prematurely: +- */ +- struct list_head list; +- +- /* +- * The PI object: +- */ +- struct rt_mutex_base pi_mutex; +- +- struct task_struct *owner; +- refcount_t refcount; +- +- union futex_key key; +-} __randomize_layout; +- +-/** +- * struct futex_q - The hashed futex queue entry, one per waiting task +- * @list: priority-sorted list of tasks waiting on this futex +- * @task: the task waiting on the futex +- * @lock_ptr: the hash bucket lock +- * @key: the key the futex is hashed on +- * @pi_state: optional priority inheritance state +- * @rt_waiter: rt_waiter storage for use with requeue_pi +- * @requeue_pi_key: the requeue_pi target futex key +- * @bitset: bitset for the optional bitmasked wakeup +- * @requeue_state: State field for futex_requeue_pi() +- * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) +- * +- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so +- * we can wake only the relevant ones (hashed queues may be shared). +- * +- * A futex_q has a woken state, just like tasks have TASK_RUNNING. +- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. +- * The order of wakeup is always to make the first condition true, then +- * the second. +- * +- * PI futexes are typically woken before they are removed from the hash list via +- * the rt_mutex code. See unqueue_me_pi(). +- */ +-struct futex_q { +- struct plist_node list; +- +- struct task_struct *task; +- spinlock_t *lock_ptr; +- union futex_key key; +- struct futex_pi_state *pi_state; +- struct rt_mutex_waiter *rt_waiter; +- union futex_key *requeue_pi_key; +- u32 bitset; +- atomic_t requeue_state; +-#ifdef CONFIG_PREEMPT_RT +- struct rcuwait requeue_wait; +-#endif +-} __randomize_layout; +- +-/* +- * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an +- * underlying rtmutex. The task which is about to be requeued could have +- * just woken up (timeout, signal). After the wake up the task has to +- * acquire hash bucket lock, which is held by the requeue code. As a task +- * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking +- * and the hash bucket lock blocking would collide and corrupt state. +- * +- * On !PREEMPT_RT this is not a problem and everything could be serialized +- * on hash bucket lock, but aside of having the benefit of common code, +- * this allows to avoid doing the requeue when the task is already on the +- * way out and taking the hash bucket lock of the original uaddr1 when the +- * requeue has been completed. +- * +- * The following state transitions are valid: +- * +- * On the waiter side: +- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE +- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT +- * +- * On the requeue side: +- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS +- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED +- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) +- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED +- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) +- * +- * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this +- * signals that the waiter is already on the way out. It also means that +- * the waiter is still on the 'wait' futex, i.e. uaddr1. 
+- * +- * The waiter side signals early wakeup to the requeue side either through +- * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending +- * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately +- * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, +- * which means the wakeup is interleaving with a requeue in progress it has +- * to wait for the requeue side to change the state. Either to DONE/LOCKED +- * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex +- * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by +- * the requeue side when the requeue attempt failed via deadlock detection +- * and therefore the waiter q is still on the uaddr1 futex. +- */ +-enum { +- Q_REQUEUE_PI_NONE = 0, +- Q_REQUEUE_PI_IGNORE, +- Q_REQUEUE_PI_IN_PROGRESS, +- Q_REQUEUE_PI_WAIT, +- Q_REQUEUE_PI_DONE, +- Q_REQUEUE_PI_LOCKED, +-}; +- +-static const struct futex_q futex_q_init = { +- /* list gets initialized in queue_me()*/ +- .key = FUTEX_KEY_INIT, +- .bitset = FUTEX_BITSET_MATCH_ANY, +- .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), +-}; +- +-/* +- * Hash buckets are shared by all the futex_keys that hash to the same +- * location. Each key may have multiple futex_q structures, one for each task +- * waiting on a futex. +- */ +-struct futex_hash_bucket { +- atomic_t waiters; +- spinlock_t lock; +- struct plist_head chain; +-} ____cacheline_aligned_in_smp; +- +-/* +- * The base of the bucket array and its size are always used together +- * (after initialization only in hash_futex()), so ensure that they +- * reside in the same cacheline. +- */ +-static struct { +- struct futex_hash_bucket *queues; +- unsigned long hashsize; +-} __futex_data __read_mostly __aligned(2*sizeof(long)); +-#define futex_queues (__futex_data.queues) +-#define futex_hashsize (__futex_data.hashsize) +- +- +-/* +- * Fault injections for futexes. +- */ +-#ifdef CONFIG_FAIL_FUTEX +- +-static struct { +- struct fault_attr attr; +- +- bool ignore_private; +-} fail_futex = { +- .attr = FAULT_ATTR_INITIALIZER, +- .ignore_private = false, +-}; +- +-static int __init setup_fail_futex(char *str) +-{ +- return setup_fault_attr(&fail_futex.attr, str); +-} +-__setup("fail_futex=", setup_fail_futex); +- +-static bool should_fail_futex(bool fshared) +-{ +- if (fail_futex.ignore_private && !fshared) +- return false; +- +- return should_fail(&fail_futex.attr, 1); +-} +- +-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +- +-static int __init fail_futex_debugfs(void) +-{ +- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; +- struct dentry *dir; +- +- dir = fault_create_debugfs_attr("fail_futex", NULL, +- &fail_futex.attr); +- if (IS_ERR(dir)) +- return PTR_ERR(dir); +- +- debugfs_create_bool("ignore-private", mode, dir, +- &fail_futex.ignore_private); +- return 0; +-} +- +-late_initcall(fail_futex_debugfs); +- +-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ +- +-#else +-static inline bool should_fail_futex(bool fshared) +-{ +- return false; +-} +-#endif /* CONFIG_FAIL_FUTEX */ +- +-#ifdef CONFIG_COMPAT +-static void compat_exit_robust_list(struct task_struct *curr); +-#endif +- +-/* +- * Reflects a new waiter being added to the waitqueue. +- */ +-static inline void hb_waiters_inc(struct futex_hash_bucket *hb) +-{ +-#ifdef CONFIG_SMP +- atomic_inc(&hb->waiters); +- /* +- * Full barrier (A), see the ordering comment above. +- */ +- smp_mb__after_atomic(); +-#endif +-} +- +-/* +- * Reflects a waiter being removed from the waitqueue by wakeup +- * paths. 
+- */ +-static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +-{ +-#ifdef CONFIG_SMP +- atomic_dec(&hb->waiters); +-#endif +-} +- +-static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +-{ +-#ifdef CONFIG_SMP +- /* +- * Full barrier (B), see the ordering comment above. +- */ +- smp_mb(); +- return atomic_read(&hb->waiters); +-#else +- return 1; +-#endif +-} +- +-/** +- * hash_futex - Return the hash bucket in the global hash +- * @key: Pointer to the futex key for which the hash is calculated +- * +- * We hash on the keys returned from get_futex_key (see below) and return the +- * corresponding hash bucket in the global hash. +- */ +-static struct futex_hash_bucket *hash_futex(union futex_key *key) +-{ +- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, +- key->both.offset); +- +- return &futex_queues[hash & (futex_hashsize - 1)]; +-} +- +- +-/** +- * match_futex - Check whether two futex keys are equal +- * @key1: Pointer to key1 +- * @key2: Pointer to key2 +- * +- * Return 1 if two futex_keys are equal, 0 otherwise. +- */ +-static inline int match_futex(union futex_key *key1, union futex_key *key2) +-{ +- return (key1 && key2 +- && key1->both.word == key2->both.word +- && key1->both.ptr == key2->both.ptr +- && key1->both.offset == key2->both.offset); +-} +- +-enum futex_access { +- FUTEX_READ, +- FUTEX_WRITE +-}; +- +-/** +- * futex_setup_timer - set up the sleeping hrtimer. +- * @time: ptr to the given timeout value +- * @timeout: the hrtimer_sleeper structure to be set up +- * @flags: futex flags +- * @range_ns: optional range in ns +- * +- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout +- * value given +- */ +-static inline struct hrtimer_sleeper * +-futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, +- int flags, u64 range_ns) +-{ +- if (!time) +- return NULL; +- +- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? +- CLOCK_REALTIME : CLOCK_MONOTONIC, +- HRTIMER_MODE_ABS); +- /* +- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is +- * effectively the same as calling hrtimer_set_expires(). +- */ +- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); +- +- return timeout; +-} +- +-/* +- * Generate a machine wide unique identifier for this inode. +- * +- * This relies on u64 not wrapping in the life-time of the machine; which with +- * 1ns resolution means almost 585 years. +- * +- * This further relies on the fact that a well formed program will not unmap +- * the file while it has a (shared) futex waiting on it. This mapping will have +- * a file reference which pins the mount and inode. +- * +- * If for some reason an inode gets evicted and read back in again, it will get +- * a new sequence number and will _NOT_ match, even though it is the exact same +- * file. +- * +- * It is important that match_futex() will never have a false-positive, esp. +- * for PI futexes that can mess up the state. The above argues that false-negatives +- * are only possible for malformed programs. +- */ +-static u64 get_inode_sequence_number(struct inode *inode) +-{ +- static atomic64_t i_seq; +- u64 old; +- +- /* Does the inode already have a sequence number? 
*/ +- old = atomic64_read(&inode->i_sequence); +- if (likely(old)) +- return old; +- +- for (;;) { +- u64 new = atomic64_add_return(1, &i_seq); +- if (WARN_ON_ONCE(!new)) +- continue; +- +- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); +- if (old) +- return old; +- return new; +- } +-} +- +-/** +- * get_futex_key() - Get parameters which are the keys for a futex +- * @uaddr: virtual address of the futex +- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED +- * @key: address where result is stored. +- * @rw: mapping needs to be read/write (values: FUTEX_READ, +- * FUTEX_WRITE) +- * +- * Return: a negative error code or 0 +- * +- * The key words are stored in @key on success. +- * +- * For shared mappings (when @fshared), the key is: +- * +- * ( inode->i_sequence, page->index, offset_within_page ) +- * +- * [ also see get_inode_sequence_number() ] +- * +- * For private mappings (or when !@fshared), the key is: +- * +- * ( current->mm, address, 0 ) +- * +- * This allows (cross process, where applicable) identification of the futex +- * without keeping the page pinned for the duration of the FUTEX_WAIT. +- * +- * lock_page() might sleep, the caller should not hold a spinlock. +- */ +-static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +- enum futex_access rw) +-{ +- unsigned long address = (unsigned long)uaddr; +- struct mm_struct *mm = current->mm; +- struct page *page, *tail; +- struct address_space *mapping; +- int err, ro = 0; +- +- /* +- * The futex address must be "naturally" aligned. +- */ +- key->both.offset = address % PAGE_SIZE; +- if (unlikely((address % sizeof(u32)) != 0)) +- return -EINVAL; +- address -= key->both.offset; +- +- if (unlikely(!access_ok(uaddr, sizeof(u32)))) +- return -EFAULT; +- +- if (unlikely(should_fail_futex(fshared))) +- return -EFAULT; +- +- /* +- * PROCESS_PRIVATE futexes are fast. +- * As the mm cannot disappear under us and the 'key' only needs +- * virtual address, we dont even have to find the underlying vma. +- * Note : We do have to check 'uaddr' is a valid user address, +- * but access_ok() should be faster than find_vma() +- */ +- if (!fshared) { +- key->private.mm = mm; +- key->private.address = address; +- return 0; +- } +- +-again: +- /* Ignore any VERIFY_READ mapping (futex common case) */ +- if (unlikely(should_fail_futex(true))) +- return -EFAULT; +- +- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); +- /* +- * If write access is not required (eg. FUTEX_WAIT), try +- * and get read-only access. +- */ +- if (err == -EFAULT && rw == FUTEX_READ) { +- err = get_user_pages_fast(address, 1, 0, &page); +- ro = 1; +- } +- if (err < 0) +- return err; +- else +- err = 0; +- +- /* +- * The treatment of mapping from this point on is critical. The page +- * lock protects many things but in this context the page lock +- * stabilizes mapping, prevents inode freeing in the shared +- * file-backed region case and guards against movement to swap cache. +- * +- * Strictly speaking the page lock is not needed in all cases being +- * considered here and page lock forces unnecessarily serialization +- * From this point on, mapping will be re-verified if necessary and +- * page lock will be acquired only if it is unavoidable +- * +- * Mapping checks require the head page for any compound page so the +- * head page and mapping is looked up now. For anonymous pages, it +- * does not matter if the page splits in the future as the key is +- * based on the address. 
For filesystem-backed pages, the tail is +- * required as the index of the page determines the key. For +- * base pages, there is no tail page and tail == page. +- */ +- tail = page; +- page = compound_head(page); +- mapping = READ_ONCE(page->mapping); +- +- /* +- * If page->mapping is NULL, then it cannot be a PageAnon +- * page; but it might be the ZERO_PAGE or in the gate area or +- * in a special mapping (all cases which we are happy to fail); +- * or it may have been a good file page when get_user_pages_fast +- * found it, but truncated or holepunched or subjected to +- * invalidate_complete_page2 before we got the page lock (also +- * cases which we are happy to fail). And we hold a reference, +- * so refcount care in invalidate_complete_page's remove_mapping +- * prevents drop_caches from setting mapping to NULL beneath us. +- * +- * The case we do have to guard against is when memory pressure made +- * shmem_writepage move it from filecache to swapcache beneath us: +- * an unlikely race, but we do need to retry for page->mapping. +- */ +- if (unlikely(!mapping)) { +- int shmem_swizzled; +- +- /* +- * Page lock is required to identify which special case above +- * applies. If this is really a shmem page then the page lock +- * will prevent unexpected transitions. +- */ +- lock_page(page); +- shmem_swizzled = PageSwapCache(page) || page->mapping; +- unlock_page(page); +- put_page(page); +- +- if (shmem_swizzled) +- goto again; +- +- return -EFAULT; +- } +- +- /* +- * Private mappings are handled in a simple way. +- * +- * If the futex key is stored on an anonymous page, then the associated +- * object is the mm which is implicitly pinned by the calling process. +- * +- * NOTE: When userspace waits on a MAP_SHARED mapping, even if +- * it's a read-only handle, it's expected that futexes attach to +- * the object not the particular process. +- */ +- if (PageAnon(page)) { +- /* +- * A RO anonymous page will never change and thus doesn't make +- * sense for futex operations. +- */ +- if (unlikely(should_fail_futex(true)) || ro) { +- err = -EFAULT; +- goto out; +- } +- +- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ +- key->private.mm = mm; +- key->private.address = address; +- +- } else { +- struct inode *inode; +- +- /* +- * The associated futex object in this case is the inode and +- * the page->mapping must be traversed. Ordinarily this should +- * be stabilised under page lock but it's not strictly +- * necessary in this case as we just want to pin the inode, not +- * update the radix tree or anything like that. +- * +- * The RCU read lock is taken as the inode is finally freed +- * under RCU. If the mapping still matches expectations then the +- * mapping->host can be safely accessed as being a valid inode. +- */ +- rcu_read_lock(); +- +- if (READ_ONCE(page->mapping) != mapping) { +- rcu_read_unlock(); +- put_page(page); +- +- goto again; +- } +- +- inode = READ_ONCE(mapping->host); +- if (!inode) { +- rcu_read_unlock(); +- put_page(page); +- +- goto again; +- } +- +- key->both.offset |= FUT_OFF_INODE; /* inode-based key */ +- key->shared.i_seq = get_inode_sequence_number(inode); +- key->shared.pgoff = page_to_pgoff(tail); +- rcu_read_unlock(); +- } +- +-out: +- put_page(page); +- return err; +-} +- +-/** +- * fault_in_user_writeable() - Fault in user address and verify RW access +- * @uaddr: pointer to faulting user space address +- * +- * Slow path to fixup the fault we just took in the atomic write +- * access to @uaddr. 
+- * +- * We have no generic implementation of a non-destructive write to the +- * user address. We know that we faulted in the atomic pagefault +- * disabled section so we can as well avoid the #PF overhead by +- * calling get_user_pages() right away. +- */ +-static int fault_in_user_writeable(u32 __user *uaddr) +-{ +- struct mm_struct *mm = current->mm; +- int ret; +- +- mmap_read_lock(mm); +- ret = fixup_user_fault(mm, (unsigned long)uaddr, +- FAULT_FLAG_WRITE, NULL); +- mmap_read_unlock(mm); +- +- return ret < 0 ? ret : 0; +-} +- +-/** +- * futex_top_waiter() - Return the highest priority waiter on a futex +- * @hb: the hash bucket the futex_q's reside in +- * @key: the futex key (to distinguish it from other futex futex_q's) +- * +- * Must be called with the hb lock held. +- */ +-static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, +- union futex_key *key) +-{ +- struct futex_q *this; +- +- plist_for_each_entry(this, &hb->chain, list) { +- if (match_futex(&this->key, key)) +- return this; +- } +- return NULL; +-} +- +-static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, +- u32 uval, u32 newval) +-{ +- int ret; +- +- pagefault_disable(); +- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); +- pagefault_enable(); +- +- return ret; +-} +- +-static int get_futex_value_locked(u32 *dest, u32 __user *from) +-{ +- int ret; +- +- pagefault_disable(); +- ret = __get_user(*dest, from); +- pagefault_enable(); +- +- return ret ? -EFAULT : 0; +-} +- +- +-/* +- * PI code: +- */ +-static int refill_pi_state_cache(void) +-{ +- struct futex_pi_state *pi_state; +- +- if (likely(current->pi_state_cache)) +- return 0; +- +- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); +- +- if (!pi_state) +- return -ENOMEM; +- +- INIT_LIST_HEAD(&pi_state->list); +- /* pi_mutex gets initialized later */ +- pi_state->owner = NULL; +- refcount_set(&pi_state->refcount, 1); +- pi_state->key = FUTEX_KEY_INIT; +- +- current->pi_state_cache = pi_state; +- +- return 0; +-} +- +-static struct futex_pi_state *alloc_pi_state(void) +-{ +- struct futex_pi_state *pi_state = current->pi_state_cache; +- +- WARN_ON(!pi_state); +- current->pi_state_cache = NULL; +- +- return pi_state; +-} +- +-static void pi_state_update_owner(struct futex_pi_state *pi_state, +- struct task_struct *new_owner) +-{ +- struct task_struct *old_owner = pi_state->owner; +- +- lockdep_assert_held(&pi_state->pi_mutex.wait_lock); +- +- if (old_owner) { +- raw_spin_lock(&old_owner->pi_lock); +- WARN_ON(list_empty(&pi_state->list)); +- list_del_init(&pi_state->list); +- raw_spin_unlock(&old_owner->pi_lock); +- } +- +- if (new_owner) { +- raw_spin_lock(&new_owner->pi_lock); +- WARN_ON(!list_empty(&pi_state->list)); +- list_add(&pi_state->list, &new_owner->pi_state_list); +- pi_state->owner = new_owner; +- raw_spin_unlock(&new_owner->pi_lock); +- } +-} +- +-static void get_pi_state(struct futex_pi_state *pi_state) +-{ +- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); +-} +- +-/* +- * Drops a reference to the pi_state object and frees or caches it +- * when the last reference is gone. 
+- */ +-static void put_pi_state(struct futex_pi_state *pi_state) +-{ +- if (!pi_state) +- return; +- +- if (!refcount_dec_and_test(&pi_state->refcount)) +- return; +- +- /* +- * If pi_state->owner is NULL, the owner is most probably dying +- * and has cleaned up the pi_state already +- */ +- if (pi_state->owner) { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); +- pi_state_update_owner(pi_state, NULL); +- rt_mutex_proxy_unlock(&pi_state->pi_mutex); +- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); +- } +- +- if (current->pi_state_cache) { +- kfree(pi_state); +- } else { +- /* +- * pi_state->list is already empty. +- * clear pi_state->owner. +- * refcount is at 0 - put it back to 1. +- */ +- pi_state->owner = NULL; +- refcount_set(&pi_state->refcount, 1); +- current->pi_state_cache = pi_state; +- } +-} +- +-#ifdef CONFIG_FUTEX_PI +- +-/* +- * This task is holding PI mutexes at exit time => bad. +- * Kernel cleans up PI-state, but userspace is likely hosed. +- * (Robust-futex cleanup is separate and might save the day for userspace.) +- */ +-static void exit_pi_state_list(struct task_struct *curr) +-{ +- struct list_head *next, *head = &curr->pi_state_list; +- struct futex_pi_state *pi_state; +- struct futex_hash_bucket *hb; +- union futex_key key = FUTEX_KEY_INIT; +- +- if (!futex_cmpxchg_enabled) +- return; +- /* +- * We are a ZOMBIE and nobody can enqueue itself on +- * pi_state_list anymore, but we have to be careful +- * versus waiters unqueueing themselves: +- */ +- raw_spin_lock_irq(&curr->pi_lock); +- while (!list_empty(head)) { +- next = head->next; +- pi_state = list_entry(next, struct futex_pi_state, list); +- key = pi_state->key; +- hb = hash_futex(&key); +- +- /* +- * We can race against put_pi_state() removing itself from the +- * list (a waiter going away). put_pi_state() will first +- * decrement the reference count and then modify the list, so +- * its possible to see the list entry but fail this reference +- * acquire. +- * +- * In that case; drop the locks to let put_pi_state() make +- * progress and retry the loop. +- */ +- if (!refcount_inc_not_zero(&pi_state->refcount)) { +- raw_spin_unlock_irq(&curr->pi_lock); +- cpu_relax(); +- raw_spin_lock_irq(&curr->pi_lock); +- continue; +- } +- raw_spin_unlock_irq(&curr->pi_lock); +- +- spin_lock(&hb->lock); +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +- raw_spin_lock(&curr->pi_lock); +- /* +- * We dropped the pi-lock, so re-check whether this +- * task still owns the PI-state: +- */ +- if (head->next != next) { +- /* retain curr->pi_lock for the loop invariant */ +- raw_spin_unlock(&pi_state->pi_mutex.wait_lock); +- spin_unlock(&hb->lock); +- put_pi_state(pi_state); +- continue; +- } +- +- WARN_ON(pi_state->owner != curr); +- WARN_ON(list_empty(&pi_state->list)); +- list_del_init(&pi_state->list); +- pi_state->owner = NULL; +- +- raw_spin_unlock(&curr->pi_lock); +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- spin_unlock(&hb->lock); +- +- rt_mutex_futex_unlock(&pi_state->pi_mutex); +- put_pi_state(pi_state); +- +- raw_spin_lock_irq(&curr->pi_lock); +- } +- raw_spin_unlock_irq(&curr->pi_lock); +-} +-#else +-static inline void exit_pi_state_list(struct task_struct *curr) { } +-#endif +- +-/* +- * We need to check the following states: +- * +- * Waiter | pi_state | pi->owner | uTID | uODIED | ? 
+- * +- * [1] NULL | --- | --- | 0 | 0/1 | Valid +- * [2] NULL | --- | --- | >0 | 0/1 | Valid +- * +- * [3] Found | NULL | -- | Any | 0/1 | Invalid +- * +- * [4] Found | Found | NULL | 0 | 1 | Valid +- * [5] Found | Found | NULL | >0 | 1 | Invalid +- * +- * [6] Found | Found | task | 0 | 1 | Valid +- * +- * [7] Found | Found | NULL | Any | 0 | Invalid +- * +- * [8] Found | Found | task | ==taskTID | 0/1 | Valid +- * [9] Found | Found | task | 0 | 0 | Invalid +- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid +- * +- * [1] Indicates that the kernel can acquire the futex atomically. We +- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. +- * +- * [2] Valid, if TID does not belong to a kernel thread. If no matching +- * thread is found then it indicates that the owner TID has died. +- * +- * [3] Invalid. The waiter is queued on a non PI futex +- * +- * [4] Valid state after exit_robust_list(), which sets the user space +- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. +- * +- * [5] The user space value got manipulated between exit_robust_list() +- * and exit_pi_state_list() +- * +- * [6] Valid state after exit_pi_state_list() which sets the new owner in +- * the pi_state but cannot access the user space value. +- * +- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. +- * +- * [8] Owner and user space value match +- * +- * [9] There is no transient state which sets the user space TID to 0 +- * except exit_robust_list(), but this is indicated by the +- * FUTEX_OWNER_DIED bit. See [4] +- * +- * [10] There is no transient state which leaves owner and user space +- * TID out of sync. Except one error case where the kernel is denied +- * write access to the user address, see fixup_pi_state_owner(). +- * +- * +- * Serialization and lifetime rules: +- * +- * hb->lock: +- * +- * hb -> futex_q, relation +- * futex_q -> pi_state, relation +- * +- * (cannot be raw because hb can contain arbitrary amount +- * of futex_q's) +- * +- * pi_mutex->wait_lock: +- * +- * {uval, pi_state} +- * +- * (and pi_mutex 'obviously') +- * +- * p->pi_lock: +- * +- * p->pi_state_list -> pi_state->list, relation +- * pi_mutex->owner -> pi_state->owner, relation +- * +- * pi_state->refcount: +- * +- * pi_state lifetime +- * +- * +- * Lock order: +- * +- * hb->lock +- * pi_mutex->wait_lock +- * p->pi_lock +- * +- */ +- +-/* +- * Validate that the existing waiter has a pi_state and sanity check +- * the pi_state against the user space value. If correct, attach to +- * it. +- */ +-static int attach_to_pi_state(u32 __user *uaddr, u32 uval, +- struct futex_pi_state *pi_state, +- struct futex_pi_state **ps) +-{ +- pid_t pid = uval & FUTEX_TID_MASK; +- u32 uval2; +- int ret; +- +- /* +- * Userspace might have messed up non-PI and PI futexes [3] +- */ +- if (unlikely(!pi_state)) +- return -EINVAL; +- +- /* +- * We get here with hb->lock held, and having found a +- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q +- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), +- * which in turn means that futex_lock_pi() still has a reference on +- * our pi_state. +- * +- * The waiter holding a reference on @pi_state also protects against +- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() +- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently +- * free pi_state before we can take a reference ourselves. 
+- */ +- WARN_ON(!refcount_read(&pi_state->refcount)); +- +- /* +- * Now that we have a pi_state, we can acquire wait_lock +- * and do the state validation. +- */ +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +- +- /* +- * Since {uval, pi_state} is serialized by wait_lock, and our current +- * uval was read without holding it, it can have changed. Verify it +- * still is what we expect it to be, otherwise retry the entire +- * operation. +- */ +- if (get_futex_value_locked(&uval2, uaddr)) +- goto out_efault; +- +- if (uval != uval2) +- goto out_eagain; +- +- /* +- * Handle the owner died case: +- */ +- if (uval & FUTEX_OWNER_DIED) { +- /* +- * exit_pi_state_list sets owner to NULL and wakes the +- * topmost waiter. The task which acquires the +- * pi_state->rt_mutex will fixup owner. +- */ +- if (!pi_state->owner) { +- /* +- * No pi state owner, but the user space TID +- * is not 0. Inconsistent state. [5] +- */ +- if (pid) +- goto out_einval; +- /* +- * Take a ref on the state and return success. [4] +- */ +- goto out_attach; +- } +- +- /* +- * If TID is 0, then either the dying owner has not +- * yet executed exit_pi_state_list() or some waiter +- * acquired the rtmutex in the pi state, but did not +- * yet fixup the TID in user space. +- * +- * Take a ref on the state and return success. [6] +- */ +- if (!pid) +- goto out_attach; +- } else { +- /* +- * If the owner died bit is not set, then the pi_state +- * must have an owner. [7] +- */ +- if (!pi_state->owner) +- goto out_einval; +- } +- +- /* +- * Bail out if user space manipulated the futex value. If pi +- * state exists then the owner TID must be the same as the +- * user space TID. [9/10] +- */ +- if (pid != task_pid_vnr(pi_state->owner)) +- goto out_einval; +- +-out_attach: +- get_pi_state(pi_state); +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- *ps = pi_state; +- return 0; +- +-out_einval: +- ret = -EINVAL; +- goto out_error; +- +-out_eagain: +- ret = -EAGAIN; +- goto out_error; +- +-out_efault: +- ret = -EFAULT; +- goto out_error; +- +-out_error: +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- return ret; +-} +- +-/** +- * wait_for_owner_exiting - Block until the owner has exited +- * @ret: owner's current futex lock status +- * @exiting: Pointer to the exiting task +- * +- * Caller must hold a refcount on @exiting. +- */ +-static void wait_for_owner_exiting(int ret, struct task_struct *exiting) +-{ +- if (ret != -EBUSY) { +- WARN_ON_ONCE(exiting); +- return; +- } +- +- if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) +- return; +- +- mutex_lock(&exiting->futex_exit_mutex); +- /* +- * No point in doing state checking here. If the waiter got here +- * while the task was in exec()->exec_futex_release() then it can +- * have any FUTEX_STATE_* value when the waiter has acquired the +- * mutex. OK, if running, EXITING or DEAD if it reached exit() +- * already. Highly unlikely and not a problem. Just one more round +- * through the futex maze. +- */ +- mutex_unlock(&exiting->futex_exit_mutex); +- +- put_task_struct(exiting); +-} +- +-static int handle_exit_race(u32 __user *uaddr, u32 uval, +- struct task_struct *tsk) +-{ +- u32 uval2; +- +- /* +- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the +- * caller that the alleged owner is busy. 
+- */ +- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) +- return -EBUSY; +- +- /* +- * Reread the user space value to handle the following situation: +- * +- * CPU0 CPU1 +- * +- * sys_exit() sys_futex() +- * do_exit() futex_lock_pi() +- * futex_lock_pi_atomic() +- * exit_signals(tsk) No waiters: +- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID +- * mm_release(tsk) Set waiter bit +- * exit_robust_list(tsk) { *uaddr = 0x80000PID; +- * Set owner died attach_to_pi_owner() { +- * *uaddr = 0xC0000000; tsk = get_task(PID); +- * } if (!tsk->flags & PF_EXITING) { +- * ... attach(); +- * tsk->futex_state = } else { +- * FUTEX_STATE_DEAD; if (tsk->futex_state != +- * FUTEX_STATE_DEAD) +- * return -EAGAIN; +- * return -ESRCH; <--- FAIL +- * } +- * +- * Returning ESRCH unconditionally is wrong here because the +- * user space value has been changed by the exiting task. +- * +- * The same logic applies to the case where the exiting task is +- * already gone. +- */ +- if (get_futex_value_locked(&uval2, uaddr)) +- return -EFAULT; +- +- /* If the user space value has changed, try again. */ +- if (uval2 != uval) +- return -EAGAIN; +- +- /* +- * The exiting task did not have a robust list, the robust list was +- * corrupted or the user space value in *uaddr is simply bogus. +- * Give up and tell user space. +- */ +- return -ESRCH; +-} +- +-static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, +- struct futex_pi_state **ps) +-{ +- /* +- * No existing pi state. First waiter. [2] +- * +- * This creates pi_state, we have hb->lock held, this means nothing can +- * observe this state, wait_lock is irrelevant. +- */ +- struct futex_pi_state *pi_state = alloc_pi_state(); +- +- /* +- * Initialize the pi_mutex in locked state and make @p +- * the owner of it: +- */ +- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); +- +- /* Store the key for possible exit cleanups: */ +- pi_state->key = *key; +- +- WARN_ON(!list_empty(&pi_state->list)); +- list_add(&pi_state->list, &p->pi_state_list); +- /* +- * Assignment without holding pi_state->pi_mutex.wait_lock is safe +- * because there is no concurrency as the object is not published yet. +- */ +- pi_state->owner = p; +- +- *ps = pi_state; +-} +-/* +- * Lookup the task for the TID provided from user space and attach to +- * it after doing proper sanity checks. +- */ +-static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, +- struct futex_pi_state **ps, +- struct task_struct **exiting) +-{ +- pid_t pid = uval & FUTEX_TID_MASK; +- struct task_struct *p; +- +- /* +- * We are the first waiter - try to look up the real owner and attach +- * the new pi_state to it, but bail out when TID = 0 [1] +- * +- * The !pid check is paranoid. None of the call sites should end up +- * with pid == 0, but better safe than sorry. Let the caller retry +- */ +- if (!pid) +- return -EAGAIN; +- p = find_get_task_by_vpid(pid); +- if (!p) +- return handle_exit_race(uaddr, uval, NULL); +- +- if (unlikely(p->flags & PF_KTHREAD)) { +- put_task_struct(p); +- return -EPERM; +- } +- +- /* +- * We need to look at the task state to figure out, whether the +- * task is exiting. To protect against the change of the task state +- * in futex_exit_release(), we do this protected by p->pi_lock: +- */ +- raw_spin_lock_irq(&p->pi_lock); +- if (unlikely(p->futex_state != FUTEX_STATE_OK)) { +- /* +- * The task is on the way out. 
When the futex state is +- * FUTEX_STATE_DEAD, we know that the task has finished +- * the cleanup: +- */ +- int ret = handle_exit_race(uaddr, uval, p); +- +- raw_spin_unlock_irq(&p->pi_lock); +- /* +- * If the owner task is between FUTEX_STATE_EXITING and +- * FUTEX_STATE_DEAD then store the task pointer and keep +- * the reference on the task struct. The calling code will +- * drop all locks, wait for the task to reach +- * FUTEX_STATE_DEAD and then drop the refcount. This is +- * required to prevent a live lock when the current task +- * preempted the exiting task between the two states. +- */ +- if (ret == -EBUSY) +- *exiting = p; +- else +- put_task_struct(p); +- return ret; +- } +- +- __attach_to_pi_owner(p, key, ps); +- raw_spin_unlock_irq(&p->pi_lock); +- +- put_task_struct(p); +- +- return 0; +-} +- +-static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +-{ +- int err; +- u32 curval; +- +- if (unlikely(should_fail_futex(true))) +- return -EFAULT; +- +- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +- if (unlikely(err)) +- return err; +- +- /* If user space value changed, let the caller retry */ +- return curval != uval ? -EAGAIN : 0; +-} +- +-/** +- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex +- * @uaddr: the pi futex user address +- * @hb: the pi futex hash bucket +- * @key: the futex key associated with uaddr and hb +- * @ps: the pi_state pointer where we store the result of the +- * lookup +- * @task: the task to perform the atomic lock work for. This will +- * be "current" except in the case of requeue pi. +- * @exiting: Pointer to store the task pointer of the owner task +- * which is in the middle of exiting +- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) +- * +- * Return: +- * - 0 - ready to wait; +- * - 1 - acquired the lock; +- * - <0 - error +- * +- * The hb->lock must be held by the caller. +- * +- * @exiting is only set when the return value is -EBUSY. If so, this holds +- * a refcount on the exiting task on return and the caller needs to drop it +- * after waiting for the exit to complete. +- */ +-static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, +- union futex_key *key, +- struct futex_pi_state **ps, +- struct task_struct *task, +- struct task_struct **exiting, +- int set_waiters) +-{ +- u32 uval, newval, vpid = task_pid_vnr(task); +- struct futex_q *top_waiter; +- int ret; +- +- /* +- * Read the user space value first so we can validate a few +- * things before proceeding further. +- */ +- if (get_futex_value_locked(&uval, uaddr)) +- return -EFAULT; +- +- if (unlikely(should_fail_futex(true))) +- return -EFAULT; +- +- /* +- * Detect deadlocks. +- */ +- if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) +- return -EDEADLK; +- +- if ((unlikely(should_fail_futex(true)))) +- return -EDEADLK; +- +- /* +- * Lookup existing state first. If it exists, try to attach to +- * its pi_state. +- */ +- top_waiter = futex_top_waiter(hb, key); +- if (top_waiter) +- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); +- +- /* +- * No waiter and user TID is 0. We are here because the +- * waiters or the owner died bit is set or called from +- * requeue_cmp_pi or for whatever reason something took the +- * syscall. +- */ +- if (!(uval & FUTEX_TID_MASK)) { +- /* +- * We take over the futex. No other waiters and the user space +- * TID is 0. We preserve the owner died bit. 
+- */ +- newval = uval & FUTEX_OWNER_DIED; +- newval |= vpid; +- +- /* The futex requeue_pi code can enforce the waiters bit */ +- if (set_waiters) +- newval |= FUTEX_WAITERS; +- +- ret = lock_pi_update_atomic(uaddr, uval, newval); +- if (ret) +- return ret; +- +- /* +- * If the waiter bit was requested the caller also needs PI +- * state attached to the new owner of the user space futex. +- * +- * @task is guaranteed to be alive and it cannot be exiting +- * because it is either sleeping or waiting in +- * futex_requeue_pi_wakeup_sync(). +- * +- * No need to do the full attach_to_pi_owner() exercise +- * because @task is known and valid. +- */ +- if (set_waiters) { +- raw_spin_lock_irq(&task->pi_lock); +- __attach_to_pi_owner(task, key, ps); +- raw_spin_unlock_irq(&task->pi_lock); +- } +- return 1; +- } +- +- /* +- * First waiter. Set the waiters bit before attaching ourself to +- * the owner. If owner tries to unlock, it will be forced into +- * the kernel and blocked on hb->lock. +- */ +- newval = uval | FUTEX_WAITERS; +- ret = lock_pi_update_atomic(uaddr, uval, newval); +- if (ret) +- return ret; +- /* +- * If the update of the user space value succeeded, we try to +- * attach to the owner. If that fails, no harm done, we only +- * set the FUTEX_WAITERS bit in the user space variable. +- */ +- return attach_to_pi_owner(uaddr, newval, key, ps, exiting); +-} +- +-/** +- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket +- * @q: The futex_q to unqueue +- * +- * The q->lock_ptr must not be NULL and must be held by the caller. +- */ +-static void __unqueue_futex(struct futex_q *q) +-{ +- struct futex_hash_bucket *hb; +- +- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) +- return; +- lockdep_assert_held(q->lock_ptr); +- +- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); +- plist_del(&q->list, &hb->chain); +- hb_waiters_dec(hb); +-} +- +-/* +- * The hash bucket lock must be held when this is called. +- * Afterwards, the futex_q must not be accessed. Callers +- * must ensure to later call wake_up_q() for the actual +- * wakeups to occur. +- */ +-static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) +-{ +- struct task_struct *p = q->task; +- +- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) +- return; +- +- get_task_struct(p); +- __unqueue_futex(q); +- /* +- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL +- * is written, without taking any locks. This is possible in the event +- * of a spurious wakeup, for example. A memory barrier is required here +- * to prevent the following store to lock_ptr from getting ahead of the +- * plist_del in __unqueue_futex(). +- */ +- smp_store_release(&q->lock_ptr, NULL); +- +- /* +- * Queue the task for later wakeup for after we've released +- * the hb->lock. +- */ +- wake_q_add_safe(wake_q, p); +-} +- +-/* +- * Caller must hold a reference on @pi_state. +- */ +-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +-{ +- struct rt_mutex_waiter *top_waiter; +- struct task_struct *new_owner; +- bool postunlock = false; +- DEFINE_RT_WAKE_Q(wqh); +- u32 curval, newval; +- int ret = 0; +- +- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); +- if (WARN_ON_ONCE(!top_waiter)) { +- /* +- * As per the comment in futex_unlock_pi() this should not happen. 
+- * +- * When this happens, give up our locks and try again, giving +- * the futex_lock_pi() instance time to complete, either by +- * waiting on the rtmutex or removing itself from the futex +- * queue. +- */ +- ret = -EAGAIN; +- goto out_unlock; +- } +- +- new_owner = top_waiter->task; +- +- /* +- * We pass it to the next owner. The WAITERS bit is always kept +- * enabled while there is PI state around. We cleanup the owner +- * died bit, because we are the owner. +- */ +- newval = FUTEX_WAITERS | task_pid_vnr(new_owner); +- +- if (unlikely(should_fail_futex(true))) { +- ret = -EFAULT; +- goto out_unlock; +- } +- +- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +- if (!ret && (curval != uval)) { +- /* +- * If a unconditional UNLOCK_PI operation (user space did not +- * try the TID->0 transition) raced with a waiter setting the +- * FUTEX_WAITERS flag between get_user() and locking the hash +- * bucket lock, retry the operation. +- */ +- if ((FUTEX_TID_MASK & curval) == uval) +- ret = -EAGAIN; +- else +- ret = -EINVAL; +- } +- +- if (!ret) { +- /* +- * This is a point of no return; once we modified the uval +- * there is no going back and subsequent operations must +- * not fail. +- */ +- pi_state_update_owner(pi_state, new_owner); +- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); +- } +- +-out_unlock: +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- +- if (postunlock) +- rt_mutex_postunlock(&wqh); +- +- return ret; +-} +- +-/* +- * Express the locking dependencies for lockdep: +- */ +-static inline void +-double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +-{ +- if (hb1 <= hb2) { +- spin_lock(&hb1->lock); +- if (hb1 < hb2) +- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); +- } else { /* hb1 > hb2 */ +- spin_lock(&hb2->lock); +- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); +- } +-} +- +-static inline void +-double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) +-{ +- spin_unlock(&hb1->lock); +- if (hb1 != hb2) +- spin_unlock(&hb2->lock); +-} +- +-/* +- * Wake up waiters matching bitset queued on this futex (uaddr). 
+- */ +-static int +-futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) +-{ +- struct futex_hash_bucket *hb; +- struct futex_q *this, *next; +- union futex_key key = FUTEX_KEY_INIT; +- int ret; +- DEFINE_WAKE_Q(wake_q); +- +- if (!bitset) +- return -EINVAL; +- +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +- hb = hash_futex(&key); +- +- /* Make sure we really have tasks to wakeup */ +- if (!hb_waiters_pending(hb)) +- return ret; +- +- spin_lock(&hb->lock); +- +- plist_for_each_entry_safe(this, next, &hb->chain, list) { +- if (match_futex (&this->key, &key)) { +- if (this->pi_state || this->rt_waiter) { +- ret = -EINVAL; +- break; +- } +- +- /* Check if one of the bits is set in both bitsets */ +- if (!(this->bitset & bitset)) +- continue; +- +- mark_wake_futex(&wake_q, this); +- if (++ret >= nr_wake) +- break; +- } +- } +- +- spin_unlock(&hb->lock); +- wake_up_q(&wake_q); +- return ret; +-} +- +-static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +-{ +- unsigned int op = (encoded_op & 0x70000000) >> 28; +- unsigned int cmp = (encoded_op & 0x0f000000) >> 24; +- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); +- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); +- int oldval, ret; +- +- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { +- if (oparg < 0 || oparg > 31) { +- char comm[sizeof(current->comm)]; +- /* +- * kill this print and return -EINVAL when userspace +- * is sane again +- */ +- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", +- get_task_comm(comm, current), oparg); +- oparg &= 31; +- } +- oparg = 1 << oparg; +- } +- +- pagefault_disable(); +- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); +- pagefault_enable(); +- if (ret) +- return ret; +- +- switch (cmp) { +- case FUTEX_OP_CMP_EQ: +- return oldval == cmparg; +- case FUTEX_OP_CMP_NE: +- return oldval != cmparg; +- case FUTEX_OP_CMP_LT: +- return oldval < cmparg; +- case FUTEX_OP_CMP_GE: +- return oldval >= cmparg; +- case FUTEX_OP_CMP_LE: +- return oldval <= cmparg; +- case FUTEX_OP_CMP_GT: +- return oldval > cmparg; +- default: +- return -ENOSYS; +- } +-} +- +-/* +- * Wake up all waiters hashed on the physical page that is mapped +- * to this virtual address: +- */ +-static int +-futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, +- int nr_wake, int nr_wake2, int op) +-{ +- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; +- struct futex_hash_bucket *hb1, *hb2; +- struct futex_q *this, *next; +- int ret, op_ret; +- DEFINE_WAKE_Q(wake_q); +- +-retry: +- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); +- if (unlikely(ret != 0)) +- return ret; +- +- hb1 = hash_futex(&key1); +- hb2 = hash_futex(&key2); +- +-retry_private: +- double_lock_hb(hb1, hb2); +- op_ret = futex_atomic_op_inuser(op, uaddr2); +- if (unlikely(op_ret < 0)) { +- double_unlock_hb(hb1, hb2); +- +- if (!IS_ENABLED(CONFIG_MMU) || +- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { +- /* +- * we don't get EFAULT from MMU faults if we don't have +- * an MMU, but we might get them from range checking +- */ +- ret = op_ret; +- return ret; +- } +- +- if (op_ret == -EFAULT) { +- ret = fault_in_user_writeable(uaddr2); +- if (ret) +- return ret; +- } +- +- cond_resched(); +- if (!(flags & FLAGS_SHARED)) +- 
goto retry_private; +- goto retry; +- } +- +- plist_for_each_entry_safe(this, next, &hb1->chain, list) { +- if (match_futex (&this->key, &key1)) { +- if (this->pi_state || this->rt_waiter) { +- ret = -EINVAL; +- goto out_unlock; +- } +- mark_wake_futex(&wake_q, this); +- if (++ret >= nr_wake) +- break; +- } +- } +- +- if (op_ret > 0) { +- op_ret = 0; +- plist_for_each_entry_safe(this, next, &hb2->chain, list) { +- if (match_futex (&this->key, &key2)) { +- if (this->pi_state || this->rt_waiter) { +- ret = -EINVAL; +- goto out_unlock; +- } +- mark_wake_futex(&wake_q, this); +- if (++op_ret >= nr_wake2) +- break; +- } +- } +- ret += op_ret; +- } +- +-out_unlock: +- double_unlock_hb(hb1, hb2); +- wake_up_q(&wake_q); +- return ret; +-} +- +-/** +- * requeue_futex() - Requeue a futex_q from one hb to another +- * @q: the futex_q to requeue +- * @hb1: the source hash_bucket +- * @hb2: the target hash_bucket +- * @key2: the new key for the requeued futex_q +- */ +-static inline +-void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, +- struct futex_hash_bucket *hb2, union futex_key *key2) +-{ +- +- /* +- * If key1 and key2 hash to the same bucket, no need to +- * requeue. +- */ +- if (likely(&hb1->chain != &hb2->chain)) { +- plist_del(&q->list, &hb1->chain); +- hb_waiters_dec(hb1); +- hb_waiters_inc(hb2); +- plist_add(&q->list, &hb2->chain); +- q->lock_ptr = &hb2->lock; +- } +- q->key = *key2; +-} +- +-static inline bool futex_requeue_pi_prepare(struct futex_q *q, +- struct futex_pi_state *pi_state) +-{ +- int old, new; +- +- /* +- * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has +- * already set Q_REQUEUE_PI_IGNORE to signal that requeue should +- * ignore the waiter. +- */ +- old = atomic_read_acquire(&q->requeue_state); +- do { +- if (old == Q_REQUEUE_PI_IGNORE) +- return false; +- +- /* +- * futex_proxy_trylock_atomic() might have set it to +- * IN_PROGRESS and a interleaved early wake to WAIT. +- * +- * It was considered to have an extra state for that +- * trylock, but that would just add more conditionals +- * all over the place for a dubious value. +- */ +- if (old != Q_REQUEUE_PI_NONE) +- break; +- +- new = Q_REQUEUE_PI_IN_PROGRESS; +- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); +- +- q->pi_state = pi_state; +- return true; +-} +- +-static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) +-{ +- int old, new; +- +- old = atomic_read_acquire(&q->requeue_state); +- do { +- if (old == Q_REQUEUE_PI_IGNORE) +- return; +- +- if (locked >= 0) { +- /* Requeue succeeded. Set DONE or LOCKED */ +- WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && +- old != Q_REQUEUE_PI_WAIT); +- new = Q_REQUEUE_PI_DONE + locked; +- } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { +- /* Deadlock, no early wakeup interleave */ +- new = Q_REQUEUE_PI_NONE; +- } else { +- /* Deadlock, early wakeup interleave. */ +- WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); +- new = Q_REQUEUE_PI_IGNORE; +- } +- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); +- +-#ifdef CONFIG_PREEMPT_RT +- /* If the waiter interleaved with the requeue let it know */ +- if (unlikely(old == Q_REQUEUE_PI_WAIT)) +- rcuwait_wake_up(&q->requeue_wait); +-#endif +-} +- +-static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) +-{ +- int old, new; +- +- old = atomic_read_acquire(&q->requeue_state); +- do { +- /* Is requeue done already? 
*/ +- if (old >= Q_REQUEUE_PI_DONE) +- return old; +- +- /* +- * If not done, then tell the requeue code to either ignore +- * the waiter or to wake it up once the requeue is done. +- */ +- new = Q_REQUEUE_PI_WAIT; +- if (old == Q_REQUEUE_PI_NONE) +- new = Q_REQUEUE_PI_IGNORE; +- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); +- +- /* If the requeue was in progress, wait for it to complete */ +- if (old == Q_REQUEUE_PI_IN_PROGRESS) { +-#ifdef CONFIG_PREEMPT_RT +- rcuwait_wait_event(&q->requeue_wait, +- atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, +- TASK_UNINTERRUPTIBLE); +-#else +- (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); +-#endif +- } +- +- /* +- * Requeue is now either prohibited or complete. Reread state +- * because during the wait above it might have changed. Nothing +- * will modify q->requeue_state after this point. +- */ +- return atomic_read(&q->requeue_state); +-} +- +-/** +- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue +- * @q: the futex_q +- * @key: the key of the requeue target futex +- * @hb: the hash_bucket of the requeue target futex +- * +- * During futex_requeue, with requeue_pi=1, it is possible to acquire the +- * target futex if it is uncontended or via a lock steal. +- * +- * 1) Set @q::key to the requeue target futex key so the waiter can detect +- * the wakeup on the right futex. +- * +- * 2) Dequeue @q from the hash bucket. +- * +- * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock +- * acquisition. +- * +- * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that +- * the waiter has to fixup the pi state. +- * +- * 5) Complete the requeue state so the waiter can make progress. After +- * this point the waiter task can return from the syscall immediately in +- * case that the pi state does not have to be fixed up. +- * +- * 6) Wake the waiter task. +- * +- * Must be called with both q->lock_ptr and hb->lock held. +- */ +-static inline +-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, +- struct futex_hash_bucket *hb) +-{ +- q->key = *key; +- +- __unqueue_futex(q); +- +- WARN_ON(!q->rt_waiter); +- q->rt_waiter = NULL; +- +- q->lock_ptr = &hb->lock; +- +- /* Signal locked state to the waiter */ +- futex_requeue_pi_complete(q, 1); +- wake_up_state(q->task, TASK_NORMAL); +-} +- +-/** +- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter +- * @pifutex: the user address of the to futex +- * @hb1: the from futex hash bucket, must be locked by the caller +- * @hb2: the to futex hash bucket, must be locked by the caller +- * @key1: the from futex key +- * @key2: the to futex key +- * @ps: address to store the pi_state pointer +- * @exiting: Pointer to store the task pointer of the owner task +- * which is in the middle of exiting +- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) +- * +- * Try and get the lock on behalf of the top waiter if we can do it atomically. +- * Wake the top waiter if we succeed. If the caller specified set_waiters, +- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. +- * hb1 and hb2 must be held by the caller. +- * +- * @exiting is only set when the return value is -EBUSY. If so, this holds +- * a refcount on the exiting task on return and the caller needs to drop it +- * after waiting for the exit to complete. 
+- * +- * Return: +- * - 0 - failed to acquire the lock atomically; +- * - >0 - acquired the lock, return value is vpid of the top_waiter +- * - <0 - error +- */ +-static int +-futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, +- struct futex_hash_bucket *hb2, union futex_key *key1, +- union futex_key *key2, struct futex_pi_state **ps, +- struct task_struct **exiting, int set_waiters) +-{ +- struct futex_q *top_waiter = NULL; +- u32 curval; +- int ret; +- +- if (get_futex_value_locked(&curval, pifutex)) +- return -EFAULT; +- +- if (unlikely(should_fail_futex(true))) +- return -EFAULT; +- +- /* +- * Find the top_waiter and determine if there are additional waiters. +- * If the caller intends to requeue more than 1 waiter to pifutex, +- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, +- * as we have means to handle the possible fault. If not, don't set +- * the bit unnecessarily as it will force the subsequent unlock to enter +- * the kernel. +- */ +- top_waiter = futex_top_waiter(hb1, key1); +- +- /* There are no waiters, nothing for us to do. */ +- if (!top_waiter) +- return 0; +- +- /* +- * Ensure that this is a waiter sitting in futex_wait_requeue_pi() +- * and waiting on the 'waitqueue' futex which is always !PI. +- */ +- if (!top_waiter->rt_waiter || top_waiter->pi_state) +- return -EINVAL; +- +- /* Ensure we requeue to the expected futex. */ +- if (!match_futex(top_waiter->requeue_pi_key, key2)) +- return -EINVAL; +- +- /* Ensure that this does not race against an early wakeup */ +- if (!futex_requeue_pi_prepare(top_waiter, NULL)) +- return -EAGAIN; +- +- /* +- * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit +- * in the contended case or if @set_waiters is true. +- * +- * In the contended case PI state is attached to the lock owner. If +- * the user space lock can be acquired then PI state is attached to +- * the new owner (@top_waiter->task) when @set_waiters is true. +- */ +- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, +- exiting, set_waiters); +- if (ret == 1) { +- /* +- * Lock was acquired in user space and PI state was +- * attached to @top_waiter->task. That means state is fully +- * consistent and the waiter can return to user space +- * immediately after the wakeup. +- */ +- requeue_pi_wake_futex(top_waiter, key2, hb2); +- } else if (ret < 0) { +- /* Rewind top_waiter::requeue_state */ +- futex_requeue_pi_complete(top_waiter, ret); +- } else { +- /* +- * futex_lock_pi_atomic() did not acquire the user space +- * futex, but managed to establish the proxy lock and pi +- * state. top_waiter::requeue_state cannot be fixed up here +- * because the waiter is not enqueued on the rtmutex +- * yet. This is handled at the callsite depending on the +- * result of rt_mutex_start_proxy_lock() which is +- * guaranteed to be reached with this function returning 0. +- */ +- } +- return ret; +-} +- +-/** +- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 +- * @uaddr1: source futex user address +- * @flags: futex flags (FLAGS_SHARED, etc.) +- * @uaddr2: target futex user address +- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) +- * @nr_requeue: number of waiters to requeue (0-INT_MAX) +- * @cmpval: @uaddr1 expected value (or %NULL) +- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a +- * pi futex (pi to pi requeue is not supported) +- * +- * Requeue waiters on uaddr1 to uaddr2. 
In the requeue_pi case, try to acquire +- * uaddr2 atomically on behalf of the top waiter. +- * +- * Return: +- * - >=0 - on success, the number of tasks requeued or woken; +- * - <0 - on error +- */ +-static int futex_requeue(u32 __user *uaddr1, unsigned int flags, +- u32 __user *uaddr2, int nr_wake, int nr_requeue, +- u32 *cmpval, int requeue_pi) +-{ +- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; +- int task_count = 0, ret; +- struct futex_pi_state *pi_state = NULL; +- struct futex_hash_bucket *hb1, *hb2; +- struct futex_q *this, *next; +- DEFINE_WAKE_Q(wake_q); +- +- if (nr_wake < 0 || nr_requeue < 0) +- return -EINVAL; +- +- /* +- * When PI not supported: return -ENOSYS if requeue_pi is true, +- * consequently the compiler knows requeue_pi is always false past +- * this point which will optimize away all the conditional code +- * further down. +- */ +- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) +- return -ENOSYS; +- +- if (requeue_pi) { +- /* +- * Requeue PI only works on two distinct uaddrs. This +- * check is only valid for private futexes. See below. +- */ +- if (uaddr1 == uaddr2) +- return -EINVAL; +- +- /* +- * futex_requeue() allows the caller to define the number +- * of waiters to wake up via the @nr_wake argument. With +- * REQUEUE_PI, waking up more than one waiter is creating +- * more problems than it solves. Waking up a waiter makes +- * only sense if the PI futex @uaddr2 is uncontended as +- * this allows the requeue code to acquire the futex +- * @uaddr2 before waking the waiter. The waiter can then +- * return to user space without further action. A secondary +- * wakeup would just make the futex_wait_requeue_pi() +- * handling more complex, because that code would have to +- * look up pi_state and do more or less all the handling +- * which the requeue code has to do for the to be requeued +- * waiters. So restrict the number of waiters to wake to +- * one, and only wake it up when the PI futex is +- * uncontended. Otherwise requeue it and let the unlock of +- * the PI futex handle the wakeup. +- * +- * All REQUEUE_PI users, e.g. pthread_cond_signal() and +- * pthread_cond_broadcast() must use nr_wake=1. +- */ +- if (nr_wake != 1) +- return -EINVAL; +- +- /* +- * requeue_pi requires a pi_state, try to allocate it now +- * without any locks in case it fails. +- */ +- if (refill_pi_state_cache()) +- return -ENOMEM; +- } +- +-retry: +- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, +- requeue_pi ? FUTEX_WRITE : FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +- /* +- * The check above which compares uaddrs is not sufficient for +- * shared futexes. We need to compare the keys: +- */ +- if (requeue_pi && match_futex(&key1, &key2)) +- return -EINVAL; +- +- hb1 = hash_futex(&key1); +- hb2 = hash_futex(&key2); +- +-retry_private: +- hb_waiters_inc(hb2); +- double_lock_hb(hb1, hb2); +- +- if (likely(cmpval != NULL)) { +- u32 curval; +- +- ret = get_futex_value_locked(&curval, uaddr1); +- +- if (unlikely(ret)) { +- double_unlock_hb(hb1, hb2); +- hb_waiters_dec(hb2); +- +- ret = get_user(curval, uaddr1); +- if (ret) +- return ret; +- +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- goto retry; +- } +- if (curval != *cmpval) { +- ret = -EAGAIN; +- goto out_unlock; +- } +- } +- +- if (requeue_pi) { +- struct task_struct *exiting = NULL; +- +- /* +- * Attempt to acquire uaddr2 and wake the top waiter. 
If we +- * intend to requeue waiters, force setting the FUTEX_WAITERS +- * bit. We force this here where we are able to easily handle +- * faults rather in the requeue loop below. +- * +- * Updates topwaiter::requeue_state if a top waiter exists. +- */ +- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, +- &key2, &pi_state, +- &exiting, nr_requeue); +- +- /* +- * At this point the top_waiter has either taken uaddr2 or +- * is waiting on it. In both cases pi_state has been +- * established and an initial refcount on it. In case of an +- * error there's nothing. +- * +- * The top waiter's requeue_state is up to date: +- * +- * - If the lock was acquired atomically (ret == 1), then +- * the state is Q_REQUEUE_PI_LOCKED. +- * +- * The top waiter has been dequeued and woken up and can +- * return to user space immediately. The kernel/user +- * space state is consistent. In case that there must be +- * more waiters requeued the WAITERS bit in the user +- * space futex is set so the top waiter task has to go +- * into the syscall slowpath to unlock the futex. This +- * will block until this requeue operation has been +- * completed and the hash bucket locks have been +- * dropped. +- * +- * - If the trylock failed with an error (ret < 0) then +- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing +- * happened", or Q_REQUEUE_PI_IGNORE when there was an +- * interleaved early wakeup. +- * +- * - If the trylock did not succeed (ret == 0) then the +- * state is either Q_REQUEUE_PI_IN_PROGRESS or +- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. +- * This will be cleaned up in the loop below, which +- * cannot fail because futex_proxy_trylock_atomic() did +- * the same sanity checks for requeue_pi as the loop +- * below does. +- */ +- switch (ret) { +- case 0: +- /* We hold a reference on the pi state. */ +- break; +- +- case 1: +- /* +- * futex_proxy_trylock_atomic() acquired the user space +- * futex. Adjust task_count. +- */ +- task_count++; +- ret = 0; +- break; +- +- /* +- * If the above failed, then pi_state is NULL and +- * waiter::requeue_state is correct. +- */ +- case -EFAULT: +- double_unlock_hb(hb1, hb2); +- hb_waiters_dec(hb2); +- ret = fault_in_user_writeable(uaddr2); +- if (!ret) +- goto retry; +- return ret; +- case -EBUSY: +- case -EAGAIN: +- /* +- * Two reasons for this: +- * - EBUSY: Owner is exiting and we just wait for the +- * exit to complete. +- * - EAGAIN: The user space value changed. +- */ +- double_unlock_hb(hb1, hb2); +- hb_waiters_dec(hb2); +- /* +- * Handle the case where the owner is in the middle of +- * exiting. Wait for the exit to complete otherwise +- * this task might loop forever, aka. live lock. +- */ +- wait_for_owner_exiting(ret, exiting); +- cond_resched(); +- goto retry; +- default: +- goto out_unlock; +- } +- } +- +- plist_for_each_entry_safe(this, next, &hb1->chain, list) { +- if (task_count - nr_wake >= nr_requeue) +- break; +- +- if (!match_futex(&this->key, &key1)) +- continue; +- +- /* +- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always +- * be paired with each other and no other futex ops. +- * +- * We should never be requeueing a futex_q with a pi_state, +- * which is awaiting a futex_unlock_pi(). 
+- */ +- if ((requeue_pi && !this->rt_waiter) || +- (!requeue_pi && this->rt_waiter) || +- this->pi_state) { +- ret = -EINVAL; +- break; +- } +- +- /* Plain futexes just wake or requeue and are done */ +- if (!requeue_pi) { +- if (++task_count <= nr_wake) +- mark_wake_futex(&wake_q, this); +- else +- requeue_futex(this, hb1, hb2, &key2); +- continue; +- } +- +- /* Ensure we requeue to the expected futex for requeue_pi. */ +- if (!match_futex(this->requeue_pi_key, &key2)) { +- ret = -EINVAL; +- break; +- } +- +- /* +- * Requeue nr_requeue waiters and possibly one more in the case +- * of requeue_pi if we couldn't acquire the lock atomically. +- * +- * Prepare the waiter to take the rt_mutex. Take a refcount +- * on the pi_state and store the pointer in the futex_q +- * object of the waiter. +- */ +- get_pi_state(pi_state); +- +- /* Don't requeue when the waiter is already on the way out. */ +- if (!futex_requeue_pi_prepare(this, pi_state)) { +- /* +- * Early woken waiter signaled that it is on the +- * way out. Drop the pi_state reference and try the +- * next waiter. @this->pi_state is still NULL. +- */ +- put_pi_state(pi_state); +- continue; +- } +- +- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, +- this->rt_waiter, +- this->task); +- +- if (ret == 1) { +- /* +- * We got the lock. We do neither drop the refcount +- * on pi_state nor clear this->pi_state because the +- * waiter needs the pi_state for cleaning up the +- * user space value. It will drop the refcount +- * after doing so. this::requeue_state is updated +- * in the wakeup as well. +- */ +- requeue_pi_wake_futex(this, &key2, hb2); +- task_count++; +- } else if (!ret) { +- /* Waiter is queued, move it to hb2 */ +- requeue_futex(this, hb1, hb2, &key2); +- futex_requeue_pi_complete(this, 0); +- task_count++; +- } else { +- /* +- * rt_mutex_start_proxy_lock() detected a potential +- * deadlock when we tried to queue that waiter. +- * Drop the pi_state reference which we took above +- * and remove the pointer to the state from the +- * waiters futex_q object. +- */ +- this->pi_state = NULL; +- put_pi_state(pi_state); +- futex_requeue_pi_complete(this, ret); +- /* +- * We stop queueing more waiters and let user space +- * deal with the mess. +- */ +- break; +- } +- } +- +- /* +- * We took an extra initial reference to the pi_state in +- * futex_proxy_trylock_atomic(). We need to drop it here again. +- */ +- put_pi_state(pi_state); +- +-out_unlock: +- double_unlock_hb(hb1, hb2); +- wake_up_q(&wake_q); +- hb_waiters_dec(hb2); +- return ret ? ret : task_count; +-} +- +-/* The key must be already stored in q->key. */ +-static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) +- __acquires(&hb->lock) +-{ +- struct futex_hash_bucket *hb; +- +- hb = hash_futex(&q->key); +- +- /* +- * Increment the counter before taking the lock so that +- * a potential waker won't miss a to-be-slept task that is +- * waiting for the spinlock. This is safe as all queue_lock() +- * users end up calling queue_me(). Similarly, for housekeeping, +- * decrement the counter at queue_unlock() when some error has +- * occurred and we don't end up adding the task to the list. 
+- */ +- hb_waiters_inc(hb); /* implies smp_mb(); (A) */ +- +- q->lock_ptr = &hb->lock; +- +- spin_lock(&hb->lock); +- return hb; +-} +- +-static inline void +-queue_unlock(struct futex_hash_bucket *hb) +- __releases(&hb->lock) +-{ +- spin_unlock(&hb->lock); +- hb_waiters_dec(hb); +-} +- +-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +-{ +- int prio; +- +- /* +- * The priority used to register this element is +- * - either the real thread-priority for the real-time threads +- * (i.e. threads with a priority lower than MAX_RT_PRIO) +- * - or MAX_RT_PRIO for non-RT threads. +- * Thus, all RT-threads are woken first in priority order, and +- * the others are woken last, in FIFO order. +- */ +- prio = min(current->normal_prio, MAX_RT_PRIO); +- +- plist_node_init(&q->list, prio); +- plist_add(&q->list, &hb->chain); +- q->task = current; +-} +- +-/** +- * queue_me() - Enqueue the futex_q on the futex_hash_bucket +- * @q: The futex_q to enqueue +- * @hb: The destination hash bucket +- * +- * The hb->lock must be held by the caller, and is released here. A call to +- * queue_me() is typically paired with exactly one call to unqueue_me(). The +- * exceptions involve the PI related operations, which may use unqueue_me_pi() +- * or nothing if the unqueue is done as part of the wake process and the unqueue +- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for +- * an example). +- */ +-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +- __releases(&hb->lock) +-{ +- __queue_me(q, hb); +- spin_unlock(&hb->lock); +-} +- +-/** +- * unqueue_me() - Remove the futex_q from its futex_hash_bucket +- * @q: The futex_q to unqueue +- * +- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must +- * be paired with exactly one earlier call to queue_me(). +- * +- * Return: +- * - 1 - if the futex_q was still queued (and we removed unqueued it); +- * - 0 - if the futex_q was already removed by the waking thread +- */ +-static int unqueue_me(struct futex_q *q) +-{ +- spinlock_t *lock_ptr; +- int ret = 0; +- +- /* In the common case we don't take the spinlock, which is nice. */ +-retry: +- /* +- * q->lock_ptr can change between this read and the following spin_lock. +- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and +- * optimizing lock_ptr out of the logic below. +- */ +- lock_ptr = READ_ONCE(q->lock_ptr); +- if (lock_ptr != NULL) { +- spin_lock(lock_ptr); +- /* +- * q->lock_ptr can change between reading it and +- * spin_lock(), causing us to take the wrong lock. This +- * corrects the race condition. +- * +- * Reasoning goes like this: if we have the wrong lock, +- * q->lock_ptr must have changed (maybe several times) +- * between reading it and the spin_lock(). It can +- * change again after the spin_lock() but only if it was +- * already changed before the spin_lock(). It cannot, +- * however, change back to the original value. Therefore +- * we can detect whether we acquired the correct lock. +- */ +- if (unlikely(lock_ptr != q->lock_ptr)) { +- spin_unlock(lock_ptr); +- goto retry; +- } +- __unqueue_futex(q); +- +- BUG_ON(q->pi_state); +- +- spin_unlock(lock_ptr); +- ret = 1; +- } +- +- return ret; +-} +- +-/* +- * PI futexes can not be requeued and must remove themselves from the +- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. 
+- */ +-static void unqueue_me_pi(struct futex_q *q) +-{ +- __unqueue_futex(q); +- +- BUG_ON(!q->pi_state); +- put_pi_state(q->pi_state); +- q->pi_state = NULL; +-} +- +-static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, +- struct task_struct *argowner) +-{ +- struct futex_pi_state *pi_state = q->pi_state; +- struct task_struct *oldowner, *newowner; +- u32 uval, curval, newval, newtid; +- int err = 0; +- +- oldowner = pi_state->owner; +- +- /* +- * We are here because either: +- * +- * - we stole the lock and pi_state->owner needs updating to reflect +- * that (@argowner == current), +- * +- * or: +- * +- * - someone stole our lock and we need to fix things to point to the +- * new owner (@argowner == NULL). +- * +- * Either way, we have to replace the TID in the user space variable. +- * This must be atomic as we have to preserve the owner died bit here. +- * +- * Note: We write the user space value _before_ changing the pi_state +- * because we can fault here. Imagine swapped out pages or a fork +- * that marked all the anonymous memory readonly for cow. +- * +- * Modifying pi_state _before_ the user space value would leave the +- * pi_state in an inconsistent state when we fault here, because we +- * need to drop the locks to handle the fault. This might be observed +- * in the PID checks when attaching to PI state . +- */ +-retry: +- if (!argowner) { +- if (oldowner != current) { +- /* +- * We raced against a concurrent self; things are +- * already fixed up. Nothing to do. +- */ +- return 0; +- } +- +- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { +- /* We got the lock. pi_state is correct. Tell caller. */ +- return 1; +- } +- +- /* +- * The trylock just failed, so either there is an owner or +- * there is a higher priority waiter than this one. +- */ +- newowner = rt_mutex_owner(&pi_state->pi_mutex); +- /* +- * If the higher priority waiter has not yet taken over the +- * rtmutex then newowner is NULL. We can't return here with +- * that state because it's inconsistent vs. the user space +- * state. So drop the locks and try again. It's a valid +- * situation and not any different from the other retry +- * conditions. +- */ +- if (unlikely(!newowner)) { +- err = -EAGAIN; +- goto handle_err; +- } +- } else { +- WARN_ON_ONCE(argowner != current); +- if (oldowner == current) { +- /* +- * We raced against a concurrent self; things are +- * already fixed up. Nothing to do. +- */ +- return 1; +- } +- newowner = argowner; +- } +- +- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; +- /* Owner died? */ +- if (!pi_state->owner) +- newtid |= FUTEX_OWNER_DIED; +- +- err = get_futex_value_locked(&uval, uaddr); +- if (err) +- goto handle_err; +- +- for (;;) { +- newval = (uval & FUTEX_OWNER_DIED) | newtid; +- +- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +- if (err) +- goto handle_err; +- +- if (curval == uval) +- break; +- uval = curval; +- } +- +- /* +- * We fixed up user space. Now we need to fix the pi_state +- * itself. +- */ +- pi_state_update_owner(pi_state, newowner); +- +- return argowner == current; +- +- /* +- * In order to reschedule or handle a page fault, we need to drop the +- * locks here. In the case of a fault, this gives the other task +- * (either the highest priority waiter itself or the task which stole +- * the rtmutex) the chance to try the fixup of the pi_state. So once we +- * are back from handling the fault we need to check the pi_state after +- * reacquiring the locks and before trying to do another fixup. 
When +- * the fixup has been done already we simply return. +- * +- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely +- * drop hb->lock since the caller owns the hb -> futex_q relation. +- * Dropping the pi_mutex->wait_lock requires the state revalidate. +- */ +-handle_err: +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- spin_unlock(q->lock_ptr); +- +- switch (err) { +- case -EFAULT: +- err = fault_in_user_writeable(uaddr); +- break; +- +- case -EAGAIN: +- cond_resched(); +- err = 0; +- break; +- +- default: +- WARN_ON_ONCE(1); +- break; +- } +- +- spin_lock(q->lock_ptr); +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +- +- /* +- * Check if someone else fixed it for us: +- */ +- if (pi_state->owner != oldowner) +- return argowner == current; +- +- /* Retry if err was -EAGAIN or the fault in succeeded */ +- if (!err) +- goto retry; +- +- /* +- * fault_in_user_writeable() failed so user state is immutable. At +- * best we can make the kernel state consistent but user state will +- * be most likely hosed and any subsequent unlock operation will be +- * rejected due to PI futex rule [10]. +- * +- * Ensure that the rtmutex owner is also the pi_state owner despite +- * the user space value claiming something different. There is no +- * point in unlocking the rtmutex if current is the owner as it +- * would need to wait until the next waiter has taken the rtmutex +- * to guarantee consistent state. Keep it simple. Userspace asked +- * for this wreckaged state. +- * +- * The rtmutex has an owner - either current or some other +- * task. See the EAGAIN loop above. +- */ +- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); +- +- return err; +-} +- +-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, +- struct task_struct *argowner) +-{ +- struct futex_pi_state *pi_state = q->pi_state; +- int ret; +- +- lockdep_assert_held(q->lock_ptr); +- +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +- ret = __fixup_pi_state_owner(uaddr, q, argowner); +- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); +- return ret; +-} +- +-static long futex_wait_restart(struct restart_block *restart); +- +-/** +- * fixup_owner() - Post lock pi_state and corner case management +- * @uaddr: user address of the futex +- * @q: futex_q (contains pi_state and access to the rt_mutex) +- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) +- * +- * After attempting to lock an rt_mutex, this function is called to cleanup +- * the pi_state owner as well as handle race conditions that may allow us to +- * acquire the lock. Must be called with the hb lock held. +- * +- * Return: +- * - 1 - success, lock taken; +- * - 0 - success, lock not taken; +- * - <0 - on error (-EFAULT) +- */ +-static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) +-{ +- if (locked) { +- /* +- * Got the lock. We might not be the anticipated owner if we +- * did a lock-steal - fix up the PI-state in that case: +- * +- * Speculative pi_state->owner read (we don't hold wait_lock); +- * since we own the lock pi_state->owner == current is the +- * stable state, anything else needs more attention. +- */ +- if (q->pi_state->owner != current) +- return fixup_pi_state_owner(uaddr, q, current); +- return 1; +- } +- +- /* +- * If we didn't get the lock; check if anybody stole it from us. In +- * that case, we need to fix up the uval to point to them instead of +- * us, otherwise bad things happen. 
[10] +- * +- * Another speculative read; pi_state->owner == current is unstable +- * but needs our attention. +- */ +- if (q->pi_state->owner == current) +- return fixup_pi_state_owner(uaddr, q, NULL); +- +- /* +- * Paranoia check. If we did not take the lock, then we should not be +- * the owner of the rt_mutex. Warn and establish consistent state. +- */ +- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) +- return fixup_pi_state_owner(uaddr, q, current); +- +- return 0; +-} +- +-/** +- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal +- * @hb: the futex hash bucket, must be locked by the caller +- * @q: the futex_q to queue up on +- * @timeout: the prepared hrtimer_sleeper, or null for no timeout +- */ +-static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, +- struct hrtimer_sleeper *timeout) +-{ +- /* +- * The task state is guaranteed to be set before another task can +- * wake it. set_current_state() is implemented using smp_store_mb() and +- * queue_me() calls spin_unlock() upon completion, both serializing +- * access to the hash list and forcing another memory barrier. +- */ +- set_current_state(TASK_INTERRUPTIBLE); +- queue_me(q, hb); +- +- /* Arm the timer */ +- if (timeout) +- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); +- +- /* +- * If we have been removed from the hash list, then another task +- * has tried to wake us, and we can skip the call to schedule(). +- */ +- if (likely(!plist_node_empty(&q->list))) { +- /* +- * If the timer has already expired, current will already be +- * flagged for rescheduling. Only call schedule if there +- * is no timeout, or if it has yet to expire. +- */ +- if (!timeout || timeout->task) +- freezable_schedule(); +- } +- __set_current_state(TASK_RUNNING); +-} +- +-/** +- * futex_wait_setup() - Prepare to wait on a futex +- * @uaddr: the futex userspace address +- * @val: the expected value +- * @flags: futex flags (FLAGS_SHARED, etc.) +- * @q: the associated futex_q +- * @hb: storage for hash_bucket pointer to be returned to caller +- * +- * Setup the futex_q and locate the hash_bucket. Get the futex value and +- * compare it with the expected value. Handle atomic faults internally. +- * Return with the hb lock held on success, and unlocked on failure. +- * +- * Return: +- * - 0 - uaddr contains val and hb has been locked; +- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked +- */ +-static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, +- struct futex_q *q, struct futex_hash_bucket **hb) +-{ +- u32 uval; +- int ret; +- +- /* +- * Access the page AFTER the hash-bucket is locked. +- * Order is important: +- * +- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); +- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } +- * +- * The basic logical guarantee of a futex is that it blocks ONLY +- * if cond(var) is known to be true at the time of blocking, for +- * any cond. If we locked the hash-bucket after testing *uaddr, that +- * would open a race condition where we could block indefinitely with +- * cond(var) false, which would violate the guarantee. +- * +- * On the other hand, we insert q and release the hash-bucket only +- * after testing *uaddr. This guarantees that futex_wait() will NOT +- * absorb a wakeup if *uaddr does not match the desired values +- * while the syscall executes. 
+- */ +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +-retry_private: +- *hb = queue_lock(q); +- +- ret = get_futex_value_locked(&uval, uaddr); +- +- if (ret) { +- queue_unlock(*hb); +- +- ret = get_user(uval, uaddr); +- if (ret) +- return ret; +- +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- goto retry; +- } +- +- if (uval != val) { +- queue_unlock(*hb); +- ret = -EWOULDBLOCK; +- } +- +- return ret; +-} +- +-static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, +- ktime_t *abs_time, u32 bitset) +-{ +- struct hrtimer_sleeper timeout, *to; +- struct restart_block *restart; +- struct futex_hash_bucket *hb; +- struct futex_q q = futex_q_init; +- int ret; +- +- if (!bitset) +- return -EINVAL; +- q.bitset = bitset; +- +- to = futex_setup_timer(abs_time, &timeout, flags, +- current->timer_slack_ns); +-retry: +- /* +- * Prepare to wait on uaddr. On success, it holds hb->lock and q +- * is initialized. +- */ +- ret = futex_wait_setup(uaddr, val, flags, &q, &hb); +- if (ret) +- goto out; +- +- /* queue_me and wait for wakeup, timeout, or a signal. */ +- futex_wait_queue_me(hb, &q, to); +- +- /* If we were woken (and unqueued), we succeeded, whatever. */ +- ret = 0; +- if (!unqueue_me(&q)) +- goto out; +- ret = -ETIMEDOUT; +- if (to && !to->task) +- goto out; +- +- /* +- * We expect signal_pending(current), but we might be the +- * victim of a spurious wakeup as well. +- */ +- if (!signal_pending(current)) +- goto retry; +- +- ret = -ERESTARTSYS; +- if (!abs_time) +- goto out; +- +- restart = ¤t->restart_block; +- restart->futex.uaddr = uaddr; +- restart->futex.val = val; +- restart->futex.time = *abs_time; +- restart->futex.bitset = bitset; +- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; +- +- ret = set_restart_fn(restart, futex_wait_restart); +- +-out: +- if (to) { +- hrtimer_cancel(&to->timer); +- destroy_hrtimer_on_stack(&to->timer); +- } +- return ret; +-} +- +- +-static long futex_wait_restart(struct restart_block *restart) +-{ +- u32 __user *uaddr = restart->futex.uaddr; +- ktime_t t, *tp = NULL; +- +- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { +- t = restart->futex.time; +- tp = &t; +- } +- restart->fn = do_no_restart_syscall; +- +- return (long)futex_wait(uaddr, restart->futex.flags, +- restart->futex.val, tp, restart->futex.bitset); +-} +- +- +-/* +- * Userspace tried a 0 -> TID atomic transition of the futex value +- * and failed. The kernel side here does the whole locking operation: +- * if there are waiters then it will block as a consequence of relying +- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see +- * a 0 value of the futex too.). +- * +- * Also serves as futex trylock_pi()'ing, and due semantics. 
+- */ +-static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, +- ktime_t *time, int trylock) +-{ +- struct hrtimer_sleeper timeout, *to; +- struct task_struct *exiting = NULL; +- struct rt_mutex_waiter rt_waiter; +- struct futex_hash_bucket *hb; +- struct futex_q q = futex_q_init; +- int res, ret; +- +- if (!IS_ENABLED(CONFIG_FUTEX_PI)) +- return -ENOSYS; +- +- if (refill_pi_state_cache()) +- return -ENOMEM; +- +- to = futex_setup_timer(time, &timeout, flags, 0); +- +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); +- if (unlikely(ret != 0)) +- goto out; +- +-retry_private: +- hb = queue_lock(&q); +- +- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, +- &exiting, 0); +- if (unlikely(ret)) { +- /* +- * Atomic work succeeded and we got the lock, +- * or failed. Either way, we do _not_ block. +- */ +- switch (ret) { +- case 1: +- /* We got the lock. */ +- ret = 0; +- goto out_unlock_put_key; +- case -EFAULT: +- goto uaddr_faulted; +- case -EBUSY: +- case -EAGAIN: +- /* +- * Two reasons for this: +- * - EBUSY: Task is exiting and we just wait for the +- * exit to complete. +- * - EAGAIN: The user space value changed. +- */ +- queue_unlock(hb); +- /* +- * Handle the case where the owner is in the middle of +- * exiting. Wait for the exit to complete otherwise +- * this task might loop forever, aka. live lock. +- */ +- wait_for_owner_exiting(ret, exiting); +- cond_resched(); +- goto retry; +- default: +- goto out_unlock_put_key; +- } +- } +- +- WARN_ON(!q.pi_state); +- +- /* +- * Only actually queue now that the atomic ops are done: +- */ +- __queue_me(&q, hb); +- +- if (trylock) { +- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); +- /* Fixup the trylock return value: */ +- ret = ret ? 0 : -EWOULDBLOCK; +- goto no_block; +- } +- +- rt_mutex_init_waiter(&rt_waiter); +- +- /* +- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not +- * hold it while doing rt_mutex_start_proxy(), because then it will +- * include hb->lock in the blocking chain, even through we'll not in +- * fact hold it while blocking. This will lead it to report -EDEADLK +- * and BUG when futex_unlock_pi() interleaves with this. +- * +- * Therefore acquire wait_lock while holding hb->lock, but drop the +- * latter before calling __rt_mutex_start_proxy_lock(). This +- * interleaves with futex_unlock_pi() -- which does a similar lock +- * handoff -- such that the latter can observe the futex_q::pi_state +- * before __rt_mutex_start_proxy_lock() is done. +- */ +- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); +- spin_unlock(q.lock_ptr); +- /* +- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter +- * such that futex_unlock_pi() is guaranteed to observe the waiter when +- * it sees the futex_q::pi_state. +- */ +- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); +- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); +- +- if (ret) { +- if (ret == 1) +- ret = 0; +- goto cleanup; +- } +- +- if (unlikely(to)) +- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); +- +- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); +- +-cleanup: +- spin_lock(q.lock_ptr); +- /* +- * If we failed to acquire the lock (deadlock/signal/timeout), we must +- * first acquire the hb->lock before removing the lock from the +- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait +- * lists consistent. 
+- * +- * In particular; it is important that futex_unlock_pi() can not +- * observe this inconsistency. +- */ +- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) +- ret = 0; +- +-no_block: +- /* +- * Fixup the pi_state owner and possibly acquire the lock if we +- * haven't already. +- */ +- res = fixup_owner(uaddr, &q, !ret); +- /* +- * If fixup_owner() returned an error, propagate that. If it acquired +- * the lock, clear our -ETIMEDOUT or -EINTR. +- */ +- if (res) +- ret = (res < 0) ? res : 0; +- +- unqueue_me_pi(&q); +- spin_unlock(q.lock_ptr); +- goto out; +- +-out_unlock_put_key: +- queue_unlock(hb); +- +-out: +- if (to) { +- hrtimer_cancel(&to->timer); +- destroy_hrtimer_on_stack(&to->timer); +- } +- return ret != -EINTR ? ret : -ERESTARTNOINTR; +- +-uaddr_faulted: +- queue_unlock(hb); +- +- ret = fault_in_user_writeable(uaddr); +- if (ret) +- goto out; +- +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- goto retry; +-} +- +-/* +- * Userspace attempted a TID -> 0 atomic transition, and failed. +- * This is the in-kernel slowpath: we look up the PI state (if any), +- * and do the rt-mutex unlock. +- */ +-static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) +-{ +- u32 curval, uval, vpid = task_pid_vnr(current); +- union futex_key key = FUTEX_KEY_INIT; +- struct futex_hash_bucket *hb; +- struct futex_q *top_waiter; +- int ret; +- +- if (!IS_ENABLED(CONFIG_FUTEX_PI)) +- return -ENOSYS; +- +-retry: +- if (get_user(uval, uaddr)) +- return -EFAULT; +- /* +- * We release only a lock we actually own: +- */ +- if ((uval & FUTEX_TID_MASK) != vpid) +- return -EPERM; +- +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); +- if (ret) +- return ret; +- +- hb = hash_futex(&key); +- spin_lock(&hb->lock); +- +- /* +- * Check waiters first. We do not trust user space values at +- * all and we at least want to know if user space fiddled +- * with the futex value instead of blindly unlocking. +- */ +- top_waiter = futex_top_waiter(hb, &key); +- if (top_waiter) { +- struct futex_pi_state *pi_state = top_waiter->pi_state; +- +- ret = -EINVAL; +- if (!pi_state) +- goto out_unlock; +- +- /* +- * If current does not own the pi_state then the futex is +- * inconsistent and user space fiddled with the futex value. +- */ +- if (pi_state->owner != current) +- goto out_unlock; +- +- get_pi_state(pi_state); +- /* +- * By taking wait_lock while still holding hb->lock, we ensure +- * there is no point where we hold neither; and therefore +- * wake_futex_pi() must observe a state consistent with what we +- * observed. +- * +- * In particular; this forces __rt_mutex_start_proxy() to +- * complete such that we're guaranteed to observe the +- * rt_waiter. Also see the WARN in wake_futex_pi(). +- */ +- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); +- spin_unlock(&hb->lock); +- +- /* drops pi_state->pi_mutex.wait_lock */ +- ret = wake_futex_pi(uaddr, uval, pi_state); +- +- put_pi_state(pi_state); +- +- /* +- * Success, we're done! No tricky corner cases. +- */ +- if (!ret) +- return ret; +- /* +- * The atomic access to the futex value generated a +- * pagefault, so retry the user-access and the wakeup: +- */ +- if (ret == -EFAULT) +- goto pi_faulted; +- /* +- * A unconditional UNLOCK_PI op raced against a waiter +- * setting the FUTEX_WAITERS bit. Try again. +- */ +- if (ret == -EAGAIN) +- goto pi_retry; +- /* +- * wake_futex_pi has detected invalid state. Tell user +- * space. 
+- */ +- return ret; +- } +- +- /* +- * We have no kernel internal state, i.e. no waiters in the +- * kernel. Waiters which are about to queue themselves are stuck +- * on hb->lock. So we can safely ignore them. We do neither +- * preserve the WAITERS bit not the OWNER_DIED one. We are the +- * owner. +- */ +- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { +- spin_unlock(&hb->lock); +- switch (ret) { +- case -EFAULT: +- goto pi_faulted; +- +- case -EAGAIN: +- goto pi_retry; +- +- default: +- WARN_ON_ONCE(1); +- return ret; +- } +- } +- +- /* +- * If uval has changed, let user space handle it. +- */ +- ret = (curval == uval) ? 0 : -EAGAIN; +- +-out_unlock: +- spin_unlock(&hb->lock); +- return ret; +- +-pi_retry: +- cond_resched(); +- goto retry; +- +-pi_faulted: +- +- ret = fault_in_user_writeable(uaddr); +- if (!ret) +- goto retry; +- +- return ret; +-} +- +-/** +- * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex +- * @hb: the hash_bucket futex_q was original enqueued on +- * @q: the futex_q woken while waiting to be requeued +- * @timeout: the timeout associated with the wait (NULL if none) +- * +- * Determine the cause for the early wakeup. +- * +- * Return: +- * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR +- */ +-static inline +-int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, +- struct futex_q *q, +- struct hrtimer_sleeper *timeout) +-{ +- int ret; +- +- /* +- * With the hb lock held, we avoid races while we process the wakeup. +- * We only need to hold hb (and not hb2) to ensure atomicity as the +- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. +- * It can't be requeued from uaddr2 to something else since we don't +- * support a PI aware source futex for requeue. +- */ +- WARN_ON_ONCE(&hb->lock != q->lock_ptr); +- +- /* +- * We were woken prior to requeue by a timeout or a signal. +- * Unqueue the futex_q and determine which it was. +- */ +- plist_del(&q->list, &hb->chain); +- hb_waiters_dec(hb); +- +- /* Handle spurious wakeups gracefully */ +- ret = -EWOULDBLOCK; +- if (timeout && !timeout->task) +- ret = -ETIMEDOUT; +- else if (signal_pending(current)) +- ret = -ERESTARTNOINTR; +- return ret; +-} +- +-/** +- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 +- * @uaddr: the futex we initially wait on (non-pi) +- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be +- * the same type, no requeueing from private to shared, etc. +- * @val: the expected value of uaddr +- * @abs_time: absolute timeout +- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all +- * @uaddr2: the pi futex we will take prior to returning to user-space +- * +- * The caller will wait on uaddr and will be requeued by futex_requeue() to +- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake +- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to +- * userspace. This ensures the rt_mutex maintains an owner when it has waiters; +- * without one, the pi logic would not know which task to boost/deboost, if +- * there was a need to. +- * +- * We call schedule in futex_wait_queue_me() when we enqueue and return there +- * via the following-- +- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() +- * 2) wakeup on uaddr2 after a requeue +- * 3) signal +- * 4) timeout +- * +- * If 3, cleanup and return -ERESTARTNOINTR. 
+- * +- * If 2, we may then block on trying to take the rt_mutex and return via: +- * 5) successful lock +- * 6) signal +- * 7) timeout +- * 8) other lock acquisition failure +- * +- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). +- * +- * If 4 or 7, we cleanup and return with -ETIMEDOUT. +- * +- * Return: +- * - 0 - On success; +- * - <0 - On error +- */ +-static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, +- u32 val, ktime_t *abs_time, u32 bitset, +- u32 __user *uaddr2) +-{ +- struct hrtimer_sleeper timeout, *to; +- struct rt_mutex_waiter rt_waiter; +- struct futex_hash_bucket *hb; +- union futex_key key2 = FUTEX_KEY_INIT; +- struct futex_q q = futex_q_init; +- struct rt_mutex_base *pi_mutex; +- int res, ret; +- +- if (!IS_ENABLED(CONFIG_FUTEX_PI)) +- return -ENOSYS; +- +- if (uaddr == uaddr2) +- return -EINVAL; +- +- if (!bitset) +- return -EINVAL; +- +- to = futex_setup_timer(abs_time, &timeout, flags, +- current->timer_slack_ns); +- +- /* +- * The waiter is allocated on our stack, manipulated by the requeue +- * code while we sleep on uaddr. +- */ +- rt_mutex_init_waiter(&rt_waiter); +- +- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); +- if (unlikely(ret != 0)) +- goto out; +- +- q.bitset = bitset; +- q.rt_waiter = &rt_waiter; +- q.requeue_pi_key = &key2; +- +- /* +- * Prepare to wait on uaddr. On success, it holds hb->lock and q +- * is initialized. +- */ +- ret = futex_wait_setup(uaddr, val, flags, &q, &hb); +- if (ret) +- goto out; +- +- /* +- * The check above which compares uaddrs is not sufficient for +- * shared futexes. We need to compare the keys: +- */ +- if (match_futex(&q.key, &key2)) { +- queue_unlock(hb); +- ret = -EINVAL; +- goto out; +- } +- +- /* Queue the futex_q, drop the hb lock, wait for wakeup. */ +- futex_wait_queue_me(hb, &q, to); +- +- switch (futex_requeue_pi_wakeup_sync(&q)) { +- case Q_REQUEUE_PI_IGNORE: +- /* The waiter is still on uaddr1 */ +- spin_lock(&hb->lock); +- ret = handle_early_requeue_pi_wakeup(hb, &q, to); +- spin_unlock(&hb->lock); +- break; +- +- case Q_REQUEUE_PI_LOCKED: +- /* The requeue acquired the lock */ +- if (q.pi_state && (q.pi_state->owner != current)) { +- spin_lock(q.lock_ptr); +- ret = fixup_owner(uaddr2, &q, true); +- /* +- * Drop the reference to the pi state which the +- * requeue_pi() code acquired for us. +- */ +- put_pi_state(q.pi_state); +- spin_unlock(q.lock_ptr); +- /* +- * Adjust the return value. It's either -EFAULT or +- * success (1) but the caller expects 0 for success. +- */ +- ret = ret < 0 ? ret : 0; +- } +- break; +- +- case Q_REQUEUE_PI_DONE: +- /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ +- pi_mutex = &q.pi_state->pi_mutex; +- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); +- +- /* Current is not longer pi_blocked_on */ +- spin_lock(q.lock_ptr); +- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) +- ret = 0; +- +- debug_rt_mutex_free_waiter(&rt_waiter); +- /* +- * Fixup the pi_state owner and possibly acquire the lock if we +- * haven't already. +- */ +- res = fixup_owner(uaddr2, &q, !ret); +- /* +- * If fixup_owner() returned an error, propagate that. If it +- * acquired the lock, clear -ETIMEDOUT or -EINTR. +- */ +- if (res) +- ret = (res < 0) ? res : 0; +- +- unqueue_me_pi(&q); +- spin_unlock(q.lock_ptr); +- +- if (ret == -EINTR) { +- /* +- * We've already been requeued, but cannot restart +- * by calling futex_lock_pi() directly. 
We could +- * restart this syscall, but it would detect that +- * the user space "val" changed and return +- * -EWOULDBLOCK. Save the overhead of the restart +- * and return -EWOULDBLOCK directly. +- */ +- ret = -EWOULDBLOCK; +- } +- break; +- default: +- BUG(); +- } +- +-out: +- if (to) { +- hrtimer_cancel(&to->timer); +- destroy_hrtimer_on_stack(&to->timer); +- } +- return ret; +-} +- +-/* +- * Support for robust futexes: the kernel cleans up held futexes at +- * thread exit time. +- * +- * Implementation: user-space maintains a per-thread list of locks it +- * is holding. Upon do_exit(), the kernel carefully walks this list, +- * and marks all locks that are owned by this thread with the +- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is +- * always manipulated with the lock held, so the list is private and +- * per-thread. Userspace also maintains a per-thread 'list_op_pending' +- * field, to allow the kernel to clean up if the thread dies after +- * acquiring the lock, but just before it could have added itself to +- * the list. There can only be one such pending lock. +- */ +- +-/** +- * sys_set_robust_list() - Set the robust-futex list head of a task +- * @head: pointer to the list-head +- * @len: length of the list-head, as userspace expects +- */ +-SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, +- size_t, len) +-{ +- if (!futex_cmpxchg_enabled) +- return -ENOSYS; +- /* +- * The kernel knows only one size for now: +- */ +- if (unlikely(len != sizeof(*head))) +- return -EINVAL; +- +- current->robust_list = head; +- +- return 0; +-} +- +-/** +- * sys_get_robust_list() - Get the robust-futex list head of a task +- * @pid: pid of the process [zero for current task] +- * @head_ptr: pointer to a list-head pointer, the kernel fills it in +- * @len_ptr: pointer to a length field, the kernel fills in the header size +- */ +-SYSCALL_DEFINE3(get_robust_list, int, pid, +- struct robust_list_head __user * __user *, head_ptr, +- size_t __user *, len_ptr) +-{ +- struct robust_list_head __user *head; +- unsigned long ret; +- struct task_struct *p; +- +- if (!futex_cmpxchg_enabled) +- return -ENOSYS; +- +- rcu_read_lock(); +- +- ret = -ESRCH; +- if (!pid) +- p = current; +- else { +- p = find_task_by_vpid(pid); +- if (!p) +- goto err_unlock; +- } +- +- ret = -EPERM; +- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) +- goto err_unlock; +- +- head = p->robust_list; +- rcu_read_unlock(); +- +- if (put_user(sizeof(*head), len_ptr)) +- return -EFAULT; +- return put_user(head, head_ptr); +- +-err_unlock: +- rcu_read_unlock(); +- +- return ret; +-} +- +-/* Constants for the pending_op argument of handle_futex_death */ +-#define HANDLE_DEATH_PENDING true +-#define HANDLE_DEATH_LIST false +- +-/* +- * Process a futex-list entry, check whether it's owned by the +- * dying task, and do notification if so: +- */ +-static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, +- bool pi, bool pending_op) +-{ +- u32 uval, nval, mval; +- int err; +- +- /* Futex address must be 32bit aligned */ +- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) +- return -1; +- +-retry: +- if (get_user(uval, uaddr)) +- return -1; +- +- /* +- * Special case for regular (non PI) futexes. The unlock path in +- * user space has two race scenarios: +- * +- * 1. The unlock path releases the user space futex value and +- * before it can execute the futex() syscall to wake up +- * waiters it is killed. +- * +- * 2. 
A woken up waiter is killed before it can acquire the +- * futex in user space. +- * +- * In both cases the TID validation below prevents a wakeup of +- * potential waiters which can cause these waiters to block +- * forever. +- * +- * In both cases the following conditions are met: +- * +- * 1) task->robust_list->list_op_pending != NULL +- * @pending_op == true +- * 2) User space futex value == 0 +- * 3) Regular futex: @pi == false +- * +- * If these conditions are met, it is safe to attempt waking up a +- * potential waiter without touching the user space futex value and +- * trying to set the OWNER_DIED bit. The user space futex value is +- * uncontended and the rest of the user space mutex state is +- * consistent, so a woken waiter will just take over the +- * uncontended futex. Setting the OWNER_DIED bit would create +- * inconsistent state and malfunction of the user space owner died +- * handling. +- */ +- if (pending_op && !pi && !uval) { +- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +- return 0; +- } +- +- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) +- return 0; +- +- /* +- * Ok, this dying thread is truly holding a futex +- * of interest. Set the OWNER_DIED bit atomically +- * via cmpxchg, and if the value had FUTEX_WAITERS +- * set, wake up a waiter (if any). (We have to do a +- * futex_wake() even if OWNER_DIED is already set - +- * to handle the rare but possible case of recursive +- * thread-death.) The rest of the cleanup is done in +- * userspace. +- */ +- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; +- +- /* +- * We are not holding a lock here, but we want to have +- * the pagefault_disable/enable() protection because +- * we want to handle the fault gracefully. If the +- * access fails we try to fault in the futex with R/W +- * verification via get_user_pages. get_user() above +- * does not guarantee R/W access. If that fails we +- * give up and leave the futex locked. +- */ +- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { +- switch (err) { +- case -EFAULT: +- if (fault_in_user_writeable(uaddr)) +- return -1; +- goto retry; +- +- case -EAGAIN: +- cond_resched(); +- goto retry; +- +- default: +- WARN_ON_ONCE(1); +- return err; +- } +- } +- +- if (nval != uval) +- goto retry; +- +- /* +- * Wake robust non-PI futexes here. The wakeup of +- * PI futexes happens in exit_pi_state(): +- */ +- if (!pi && (uval & FUTEX_WAITERS)) +- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +- +- return 0; +-} +- +-/* +- * Fetch a robust-list pointer. Bit 0 signals PI futexes: +- */ +-static inline int fetch_robust_entry(struct robust_list __user **entry, +- struct robust_list __user * __user *head, +- unsigned int *pi) +-{ +- unsigned long uentry; +- +- if (get_user(uentry, (unsigned long __user *)head)) +- return -EFAULT; +- +- *entry = (void __user *)(uentry & ~1UL); +- *pi = uentry & 1; +- +- return 0; +-} +- +-/* +- * Walk curr->robust_list (very carefully, it's a userspace list!) +- * and mark any locks found there dead, and notify any waiters. +- * +- * We silently return on any sign of list-walking problem. 
+- */ +-static void exit_robust_list(struct task_struct *curr) +-{ +- struct robust_list_head __user *head = curr->robust_list; +- struct robust_list __user *entry, *next_entry, *pending; +- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; +- unsigned int next_pi; +- unsigned long futex_offset; +- int rc; +- +- if (!futex_cmpxchg_enabled) +- return; +- +- /* +- * Fetch the list head (which was registered earlier, via +- * sys_set_robust_list()): +- */ +- if (fetch_robust_entry(&entry, &head->list.next, &pi)) +- return; +- /* +- * Fetch the relative futex offset: +- */ +- if (get_user(futex_offset, &head->futex_offset)) +- return; +- /* +- * Fetch any possibly pending lock-add first, and handle it +- * if it exists: +- */ +- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) +- return; +- +- next_entry = NULL; /* avoid warning with gcc */ +- while (entry != &head->list) { +- /* +- * Fetch the next entry in the list before calling +- * handle_futex_death: +- */ +- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); +- /* +- * A pending lock might already be on the list, so +- * don't process it twice: +- */ +- if (entry != pending) { +- if (handle_futex_death((void __user *)entry + futex_offset, +- curr, pi, HANDLE_DEATH_LIST)) +- return; +- } +- if (rc) +- return; +- entry = next_entry; +- pi = next_pi; +- /* +- * Avoid excessively long or circular lists: +- */ +- if (!--limit) +- break; +- +- cond_resched(); +- } +- +- if (pending) { +- handle_futex_death((void __user *)pending + futex_offset, +- curr, pip, HANDLE_DEATH_PENDING); +- } +-} +- +-static void futex_cleanup(struct task_struct *tsk) +-{ +- if (unlikely(tsk->robust_list)) { +- exit_robust_list(tsk); +- tsk->robust_list = NULL; +- } +- +-#ifdef CONFIG_COMPAT +- if (unlikely(tsk->compat_robust_list)) { +- compat_exit_robust_list(tsk); +- tsk->compat_robust_list = NULL; +- } +-#endif +- +- if (unlikely(!list_empty(&tsk->pi_state_list))) +- exit_pi_state_list(tsk); +-} +- +-/** +- * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD +- * @tsk: task to set the state on +- * +- * Set the futex exit state of the task lockless. The futex waiter code +- * observes that state when a task is exiting and loops until the task has +- * actually finished the futex cleanup. The worst case for this is that the +- * waiter runs through the wait loop until the state becomes visible. +- * +- * This is called from the recursive fault handling path in do_exit(). +- * +- * This is best effort. Either the futex exit code has run already or +- * not. If the OWNER_DIED bit has been set on the futex then the waiter can +- * take it over. If not, the problem is pushed back to user space. If the +- * futex exit code did not run yet, then an already queued waiter might +- * block forever, but there is nothing which can be done about that. +- */ +-void futex_exit_recursive(struct task_struct *tsk) +-{ +- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ +- if (tsk->futex_state == FUTEX_STATE_EXITING) +- mutex_unlock(&tsk->futex_exit_mutex); +- tsk->futex_state = FUTEX_STATE_DEAD; +-} +- +-static void futex_cleanup_begin(struct task_struct *tsk) +-{ +- /* +- * Prevent various race issues against a concurrent incoming waiter +- * including live locks by forcing the waiter to block on +- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in +- * attach_to_pi_owner(). +- */ +- mutex_lock(&tsk->futex_exit_mutex); +- +- /* +- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. 
+- * +- * This ensures that all subsequent checks of tsk->futex_state in +- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with +- * tsk->pi_lock held. +- * +- * It guarantees also that a pi_state which was queued right before +- * the state change under tsk->pi_lock by a concurrent waiter must +- * be observed in exit_pi_state_list(). +- */ +- raw_spin_lock_irq(&tsk->pi_lock); +- tsk->futex_state = FUTEX_STATE_EXITING; +- raw_spin_unlock_irq(&tsk->pi_lock); +-} +- +-static void futex_cleanup_end(struct task_struct *tsk, int state) +-{ +- /* +- * Lockless store. The only side effect is that an observer might +- * take another loop until it becomes visible. +- */ +- tsk->futex_state = state; +- /* +- * Drop the exit protection. This unblocks waiters which observed +- * FUTEX_STATE_EXITING to reevaluate the state. +- */ +- mutex_unlock(&tsk->futex_exit_mutex); +-} +- +-void futex_exec_release(struct task_struct *tsk) +-{ +- /* +- * The state handling is done for consistency, but in the case of +- * exec() there is no way to prevent further damage as the PID stays +- * the same. But for the unlikely and arguably buggy case that a +- * futex is held on exec(), this provides at least as much state +- * consistency protection which is possible. +- */ +- futex_cleanup_begin(tsk); +- futex_cleanup(tsk); +- /* +- * Reset the state to FUTEX_STATE_OK. The task is alive and about +- * exec a new binary. +- */ +- futex_cleanup_end(tsk, FUTEX_STATE_OK); +-} +- +-void futex_exit_release(struct task_struct *tsk) +-{ +- futex_cleanup_begin(tsk); +- futex_cleanup(tsk); +- futex_cleanup_end(tsk, FUTEX_STATE_DEAD); +-} +- +-long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, +- u32 __user *uaddr2, u32 val2, u32 val3) +-{ +- int cmd = op & FUTEX_CMD_MASK; +- unsigned int flags = 0; +- +- if (!(op & FUTEX_PRIVATE_FLAG)) +- flags |= FLAGS_SHARED; +- +- if (op & FUTEX_CLOCK_REALTIME) { +- flags |= FLAGS_CLOCKRT; +- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && +- cmd != FUTEX_LOCK_PI2) +- return -ENOSYS; +- } +- +- switch (cmd) { +- case FUTEX_LOCK_PI: +- case FUTEX_LOCK_PI2: +- case FUTEX_UNLOCK_PI: +- case FUTEX_TRYLOCK_PI: +- case FUTEX_WAIT_REQUEUE_PI: +- case FUTEX_CMP_REQUEUE_PI: +- if (!futex_cmpxchg_enabled) +- return -ENOSYS; +- } +- +- switch (cmd) { +- case FUTEX_WAIT: +- val3 = FUTEX_BITSET_MATCH_ANY; +- fallthrough; +- case FUTEX_WAIT_BITSET: +- return futex_wait(uaddr, flags, val, timeout, val3); +- case FUTEX_WAKE: +- val3 = FUTEX_BITSET_MATCH_ANY; +- fallthrough; +- case FUTEX_WAKE_BITSET: +- return futex_wake(uaddr, flags, val, val3); +- case FUTEX_REQUEUE: +- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); +- case FUTEX_CMP_REQUEUE: +- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); +- case FUTEX_WAKE_OP: +- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); +- case FUTEX_LOCK_PI: +- flags |= FLAGS_CLOCKRT; +- fallthrough; +- case FUTEX_LOCK_PI2: +- return futex_lock_pi(uaddr, flags, timeout, 0); +- case FUTEX_UNLOCK_PI: +- return futex_unlock_pi(uaddr, flags); +- case FUTEX_TRYLOCK_PI: +- return futex_lock_pi(uaddr, flags, NULL, 1); +- case FUTEX_WAIT_REQUEUE_PI: +- val3 = FUTEX_BITSET_MATCH_ANY; +- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, +- uaddr2); +- case FUTEX_CMP_REQUEUE_PI: +- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); +- } +- return -ENOSYS; +-} +- +-static __always_inline bool futex_cmd_has_timeout(u32 cmd) +-{ +- switch (cmd) { +- case FUTEX_WAIT: +- case 
FUTEX_LOCK_PI: +- case FUTEX_LOCK_PI2: +- case FUTEX_WAIT_BITSET: +- case FUTEX_WAIT_REQUEUE_PI: +- return true; +- } +- return false; +-} +- +-static __always_inline int +-futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) +-{ +- if (!timespec64_valid(ts)) +- return -EINVAL; +- +- *t = timespec64_to_ktime(*ts); +- if (cmd == FUTEX_WAIT) +- *t = ktime_add_safe(ktime_get(), *t); +- else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) +- *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); +- return 0; +-} +- +-SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +- const struct __kernel_timespec __user *, utime, +- u32 __user *, uaddr2, u32, val3) +-{ +- int ret, cmd = op & FUTEX_CMD_MASK; +- ktime_t t, *tp = NULL; +- struct timespec64 ts; +- +- if (utime && futex_cmd_has_timeout(cmd)) { +- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) +- return -EFAULT; +- if (get_timespec64(&ts, utime)) +- return -EFAULT; +- ret = futex_init_timeout(cmd, op, &ts, &t); +- if (ret) +- return ret; +- tp = &t; +- } +- +- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); +-} +- +-#ifdef CONFIG_COMPAT +-/* +- * Fetch a robust-list pointer. Bit 0 signals PI futexes: +- */ +-static inline int +-compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, +- compat_uptr_t __user *head, unsigned int *pi) +-{ +- if (get_user(*uentry, head)) +- return -EFAULT; +- +- *entry = compat_ptr((*uentry) & ~1); +- *pi = (unsigned int)(*uentry) & 1; +- +- return 0; +-} +- +-static void __user *futex_uaddr(struct robust_list __user *entry, +- compat_long_t futex_offset) +-{ +- compat_uptr_t base = ptr_to_compat(entry); +- void __user *uaddr = compat_ptr(base + futex_offset); +- +- return uaddr; +-} +- +-/* +- * Walk curr->robust_list (very carefully, it's a userspace list!) +- * and mark any locks found there dead, and notify any waiters. +- * +- * We silently return on any sign of list-walking problem. 
+- */ +-static void compat_exit_robust_list(struct task_struct *curr) +-{ +- struct compat_robust_list_head __user *head = curr->compat_robust_list; +- struct robust_list __user *entry, *next_entry, *pending; +- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; +- unsigned int next_pi; +- compat_uptr_t uentry, next_uentry, upending; +- compat_long_t futex_offset; +- int rc; +- +- if (!futex_cmpxchg_enabled) +- return; +- +- /* +- * Fetch the list head (which was registered earlier, via +- * sys_set_robust_list()): +- */ +- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) +- return; +- /* +- * Fetch the relative futex offset: +- */ +- if (get_user(futex_offset, &head->futex_offset)) +- return; +- /* +- * Fetch any possibly pending lock-add first, and handle it +- * if it exists: +- */ +- if (compat_fetch_robust_entry(&upending, &pending, +- &head->list_op_pending, &pip)) +- return; +- +- next_entry = NULL; /* avoid warning with gcc */ +- while (entry != (struct robust_list __user *) &head->list) { +- /* +- * Fetch the next entry in the list before calling +- * handle_futex_death: +- */ +- rc = compat_fetch_robust_entry(&next_uentry, &next_entry, +- (compat_uptr_t __user *)&entry->next, &next_pi); +- /* +- * A pending lock might already be on the list, so +- * dont process it twice: +- */ +- if (entry != pending) { +- void __user *uaddr = futex_uaddr(entry, futex_offset); +- +- if (handle_futex_death(uaddr, curr, pi, +- HANDLE_DEATH_LIST)) +- return; +- } +- if (rc) +- return; +- uentry = next_uentry; +- entry = next_entry; +- pi = next_pi; +- /* +- * Avoid excessively long or circular lists: +- */ +- if (!--limit) +- break; +- +- cond_resched(); +- } +- if (pending) { +- void __user *uaddr = futex_uaddr(pending, futex_offset); +- +- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); +- } +-} +- +-COMPAT_SYSCALL_DEFINE2(set_robust_list, +- struct compat_robust_list_head __user *, head, +- compat_size_t, len) +-{ +- if (!futex_cmpxchg_enabled) +- return -ENOSYS; +- +- if (unlikely(len != sizeof(*head))) +- return -EINVAL; +- +- current->compat_robust_list = head; +- +- return 0; +-} +- +-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, +- compat_uptr_t __user *, head_ptr, +- compat_size_t __user *, len_ptr) +-{ +- struct compat_robust_list_head __user *head; +- unsigned long ret; +- struct task_struct *p; +- +- if (!futex_cmpxchg_enabled) +- return -ENOSYS; +- +- rcu_read_lock(); +- +- ret = -ESRCH; +- if (!pid) +- p = current; +- else { +- p = find_task_by_vpid(pid); +- if (!p) +- goto err_unlock; +- } +- +- ret = -EPERM; +- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) +- goto err_unlock; +- +- head = p->compat_robust_list; +- rcu_read_unlock(); +- +- if (put_user(sizeof(*head), len_ptr)) +- return -EFAULT; +- return put_user(ptr_to_compat(head), head_ptr); +- +-err_unlock: +- rcu_read_unlock(); +- +- return ret; +-} +-#endif /* CONFIG_COMPAT */ +- +-#ifdef CONFIG_COMPAT_32BIT_TIME +-SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, +- const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, +- u32, val3) +-{ +- int ret, cmd = op & FUTEX_CMD_MASK; +- ktime_t t, *tp = NULL; +- struct timespec64 ts; +- +- if (utime && futex_cmd_has_timeout(cmd)) { +- if (get_old_timespec32(&ts, utime)) +- return -EFAULT; +- ret = futex_init_timeout(cmd, op, &ts, &t); +- if (ret) +- return ret; +- tp = &t; +- } +- +- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); +-} +-#endif /* CONFIG_COMPAT_32BIT_TIME */ +- 
+-static void __init futex_detect_cmpxchg(void) +-{ +-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG +- u32 curval; +- +- /* +- * This will fail and we want it. Some arch implementations do +- * runtime detection of the futex_atomic_cmpxchg_inatomic() +- * functionality. We want to know that before we call in any +- * of the complex code paths. Also we want to prevent +- * registration of robust lists in that case. NULL is +- * guaranteed to fault and we get -EFAULT on functional +- * implementation, the non-functional ones will return +- * -ENOSYS. +- */ +- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) +- futex_cmpxchg_enabled = 1; +-#endif +-} +- +-static int __init futex_init(void) +-{ +- unsigned int futex_shift; +- unsigned long i; +- +-#if CONFIG_BASE_SMALL +- futex_hashsize = 16; +-#else +- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +-#endif +- +- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), +- futex_hashsize, 0, +- futex_hashsize < 256 ? HASH_SMALL : 0, +- &futex_shift, NULL, +- futex_hashsize, futex_hashsize); +- futex_hashsize = 1UL << futex_shift; +- +- futex_detect_cmpxchg(); +- +- for (i = 0; i < futex_hashsize; i++) { +- atomic_set(&futex_queues[i].waiters, 0); +- plist_head_init(&futex_queues[i].chain); +- spin_lock_init(&futex_queues[i].lock); +- } +- +- return 0; +-} +-core_initcall(futex_init); +diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile +new file mode 100644 +index 000000000..b77188d1f +--- /dev/null ++++ b/kernel/futex/Makefile +@@ -0,0 +1,3 @@ ++# SPDX-License-Identifier: GPL-2.0 ++ ++obj-y += core.o syscalls.o pi.o requeue.o waitwake.o +diff --git a/kernel/futex/core.c b/kernel/futex/core.c +new file mode 100644 +index 000000000..25d8a88b3 +--- /dev/null ++++ b/kernel/futex/core.c +@@ -0,0 +1,1176 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Fast Userspace Mutexes (which I call "Futexes!"). ++ * (C) Rusty Russell, IBM 2002 ++ * ++ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar ++ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved ++ * ++ * Removed page pinning, fix privately mapped COW pages and other cleanups ++ * (C) Copyright 2003, 2004 Jamie Lokier ++ * ++ * Robust futex support started by Ingo Molnar ++ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved ++ * Thanks to Thomas Gleixner for suggestions, analysis and fixes. ++ * ++ * PI-futex support started by Ingo Molnar and Thomas Gleixner ++ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar ++ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner ++ * ++ * PRIVATE futexes by Eric Dumazet ++ * Copyright (C) 2007 Eric Dumazet ++ * ++ * Requeue-PI support by Darren Hart ++ * Copyright (C) IBM Corporation, 2009 ++ * Thanks to Thomas Gleixner for conceptual design and careful reviews. ++ * ++ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly ++ * enough at me, Linus for the original (flawed) idea, Matthew ++ * Kirkwood for proof-of-concept implementation. ++ * ++ * "The futexes are also cursed." ++ * "But they come in a choice of three flavours!" ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "futex.h" ++#include "../locking/rtmutex_common.h" ++ ++#ifndef CONFIG_HAVE_FUTEX_CMPXCHG ++int __read_mostly futex_cmpxchg_enabled; ++#endif ++ ++ ++/* ++ * The base of the bucket array and its size are always used together ++ * (after initialization only in futex_hash()), so ensure that they ++ * reside in the same cacheline. 
++ */ ++static struct { ++ struct futex_hash_bucket *queues; ++ unsigned long hashsize; ++} __futex_data __read_mostly __aligned(2*sizeof(long)); ++#define futex_queues (__futex_data.queues) ++#define futex_hashsize (__futex_data.hashsize) ++ ++ ++/* ++ * Fault injections for futexes. ++ */ ++#ifdef CONFIG_FAIL_FUTEX ++ ++static struct { ++ struct fault_attr attr; ++ ++ bool ignore_private; ++} fail_futex = { ++ .attr = FAULT_ATTR_INITIALIZER, ++ .ignore_private = false, ++}; ++ ++static int __init setup_fail_futex(char *str) ++{ ++ return setup_fault_attr(&fail_futex.attr, str); ++} ++__setup("fail_futex=", setup_fail_futex); ++ ++bool should_fail_futex(bool fshared) ++{ ++ if (fail_futex.ignore_private && !fshared) ++ return false; ++ ++ return should_fail(&fail_futex.attr, 1); ++} ++ ++#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS ++ ++static int __init fail_futex_debugfs(void) ++{ ++ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; ++ struct dentry *dir; ++ ++ dir = fault_create_debugfs_attr("fail_futex", NULL, ++ &fail_futex.attr); ++ if (IS_ERR(dir)) ++ return PTR_ERR(dir); ++ ++ debugfs_create_bool("ignore-private", mode, dir, ++ &fail_futex.ignore_private); ++ return 0; ++} ++ ++late_initcall(fail_futex_debugfs); ++ ++#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ ++ ++#endif /* CONFIG_FAIL_FUTEX */ ++ ++/** ++ * futex_hash - Return the hash bucket in the global hash ++ * @key: Pointer to the futex key for which the hash is calculated ++ * ++ * We hash on the keys returned from get_futex_key (see below) and return the ++ * corresponding hash bucket in the global hash. ++ */ ++struct futex_hash_bucket *futex_hash(union futex_key *key) ++{ ++ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, ++ key->both.offset); ++ ++ return &futex_queues[hash & (futex_hashsize - 1)]; ++} ++ ++ ++/** ++ * futex_setup_timer - set up the sleeping hrtimer. ++ * @time: ptr to the given timeout value ++ * @timeout: the hrtimer_sleeper structure to be set up ++ * @flags: futex flags ++ * @range_ns: optional range in ns ++ * ++ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout ++ * value given ++ */ ++struct hrtimer_sleeper * ++futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, ++ int flags, u64 range_ns) ++{ ++ if (!time) ++ return NULL; ++ ++ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? ++ CLOCK_REALTIME : CLOCK_MONOTONIC, ++ HRTIMER_MODE_ABS); ++ /* ++ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is ++ * effectively the same as calling hrtimer_set_expires(). ++ */ ++ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); ++ ++ return timeout; ++} ++ ++/* ++ * Generate a machine wide unique identifier for this inode. ++ * ++ * This relies on u64 not wrapping in the life-time of the machine; which with ++ * 1ns resolution means almost 585 years. ++ * ++ * This further relies on the fact that a well formed program will not unmap ++ * the file while it has a (shared) futex waiting on it. This mapping will have ++ * a file reference which pins the mount and inode. ++ * ++ * If for some reason an inode gets evicted and read back in again, it will get ++ * a new sequence number and will _NOT_ match, even though it is the exact same ++ * file. ++ * ++ * It is important that futex_match() will never have a false-positive, esp. ++ * for PI futexes that can mess up the state. The above argues that false-negatives ++ * are only possible for malformed programs. 
++ */ ++static u64 get_inode_sequence_number(struct inode *inode) ++{ ++ static atomic64_t i_seq; ++ u64 old; ++ ++ /* Does the inode already have a sequence number? */ ++ old = atomic64_read(&inode->i_sequence); ++ if (likely(old)) ++ return old; ++ ++ for (;;) { ++ u64 new = atomic64_add_return(1, &i_seq); ++ if (WARN_ON_ONCE(!new)) ++ continue; ++ ++ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); ++ if (old) ++ return old; ++ return new; ++ } ++} ++ ++/** ++ * get_futex_key() - Get parameters which are the keys for a futex ++ * @uaddr: virtual address of the futex ++ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED ++ * @key: address where result is stored. ++ * @rw: mapping needs to be read/write (values: FUTEX_READ, ++ * FUTEX_WRITE) ++ * ++ * Return: a negative error code or 0 ++ * ++ * The key words are stored in @key on success. ++ * ++ * For shared mappings (when @fshared), the key is: ++ * ++ * ( inode->i_sequence, page->index, offset_within_page ) ++ * ++ * [ also see get_inode_sequence_number() ] ++ * ++ * For private mappings (or when !@fshared), the key is: ++ * ++ * ( current->mm, address, 0 ) ++ * ++ * This allows (cross process, where applicable) identification of the futex ++ * without keeping the page pinned for the duration of the FUTEX_WAIT. ++ * ++ * lock_page() might sleep, the caller should not hold a spinlock. ++ */ ++int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ++ enum futex_access rw) ++{ ++ unsigned long address = (unsigned long)uaddr; ++ struct mm_struct *mm = current->mm; ++ struct page *page, *tail; ++ struct address_space *mapping; ++ int err, ro = 0; ++ ++ /* ++ * The futex address must be "naturally" aligned. ++ */ ++ key->both.offset = address % PAGE_SIZE; ++ if (unlikely((address % sizeof(u32)) != 0)) ++ return -EINVAL; ++ address -= key->both.offset; ++ ++ if (unlikely(!access_ok(uaddr, sizeof(u32)))) ++ return -EFAULT; ++ ++ if (unlikely(should_fail_futex(fshared))) ++ return -EFAULT; ++ ++ /* ++ * PROCESS_PRIVATE futexes are fast. ++ * As the mm cannot disappear under us and the 'key' only needs ++ * virtual address, we dont even have to find the underlying vma. ++ * Note : We do have to check 'uaddr' is a valid user address, ++ * but access_ok() should be faster than find_vma() ++ */ ++ if (!fshared) { ++ key->private.mm = mm; ++ key->private.address = address; ++ return 0; ++ } ++ ++again: ++ /* Ignore any VERIFY_READ mapping (futex common case) */ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); ++ /* ++ * If write access is not required (eg. FUTEX_WAIT), try ++ * and get read-only access. ++ */ ++ if (err == -EFAULT && rw == FUTEX_READ) { ++ err = get_user_pages_fast(address, 1, 0, &page); ++ ro = 1; ++ } ++ if (err < 0) ++ return err; ++ else ++ err = 0; ++ ++ /* ++ * The treatment of mapping from this point on is critical. The page ++ * lock protects many things but in this context the page lock ++ * stabilizes mapping, prevents inode freeing in the shared ++ * file-backed region case and guards against movement to swap cache. 
++ * ++ * Strictly speaking the page lock is not needed in all cases being ++ * considered here and page lock forces unnecessarily serialization ++ * From this point on, mapping will be re-verified if necessary and ++ * page lock will be acquired only if it is unavoidable ++ * ++ * Mapping checks require the head page for any compound page so the ++ * head page and mapping is looked up now. For anonymous pages, it ++ * does not matter if the page splits in the future as the key is ++ * based on the address. For filesystem-backed pages, the tail is ++ * required as the index of the page determines the key. For ++ * base pages, there is no tail page and tail == page. ++ */ ++ tail = page; ++ page = compound_head(page); ++ mapping = READ_ONCE(page->mapping); ++ ++ /* ++ * If page->mapping is NULL, then it cannot be a PageAnon ++ * page; but it might be the ZERO_PAGE or in the gate area or ++ * in a special mapping (all cases which we are happy to fail); ++ * or it may have been a good file page when get_user_pages_fast ++ * found it, but truncated or holepunched or subjected to ++ * invalidate_complete_page2 before we got the page lock (also ++ * cases which we are happy to fail). And we hold a reference, ++ * so refcount care in invalidate_complete_page's remove_mapping ++ * prevents drop_caches from setting mapping to NULL beneath us. ++ * ++ * The case we do have to guard against is when memory pressure made ++ * shmem_writepage move it from filecache to swapcache beneath us: ++ * an unlikely race, but we do need to retry for page->mapping. ++ */ ++ if (unlikely(!mapping)) { ++ int shmem_swizzled; ++ ++ /* ++ * Page lock is required to identify which special case above ++ * applies. If this is really a shmem page then the page lock ++ * will prevent unexpected transitions. ++ */ ++ lock_page(page); ++ shmem_swizzled = PageSwapCache(page) || page->mapping; ++ unlock_page(page); ++ put_page(page); ++ ++ if (shmem_swizzled) ++ goto again; ++ ++ return -EFAULT; ++ } ++ ++ /* ++ * Private mappings are handled in a simple way. ++ * ++ * If the futex key is stored on an anonymous page, then the associated ++ * object is the mm which is implicitly pinned by the calling process. ++ * ++ * NOTE: When userspace waits on a MAP_SHARED mapping, even if ++ * it's a read-only handle, it's expected that futexes attach to ++ * the object not the particular process. ++ */ ++ if (PageAnon(page)) { ++ /* ++ * A RO anonymous page will never change and thus doesn't make ++ * sense for futex operations. ++ */ ++ if (unlikely(should_fail_futex(true)) || ro) { ++ err = -EFAULT; ++ goto out; ++ } ++ ++ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ ++ key->private.mm = mm; ++ key->private.address = address; ++ ++ } else { ++ struct inode *inode; ++ ++ /* ++ * The associated futex object in this case is the inode and ++ * the page->mapping must be traversed. Ordinarily this should ++ * be stabilised under page lock but it's not strictly ++ * necessary in this case as we just want to pin the inode, not ++ * update the radix tree or anything like that. ++ * ++ * The RCU read lock is taken as the inode is finally freed ++ * under RCU. If the mapping still matches expectations then the ++ * mapping->host can be safely accessed as being a valid inode. 
++ */ ++ rcu_read_lock(); ++ ++ if (READ_ONCE(page->mapping) != mapping) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ inode = READ_ONCE(mapping->host); ++ if (!inode) { ++ rcu_read_unlock(); ++ put_page(page); ++ ++ goto again; ++ } ++ ++ key->both.offset |= FUT_OFF_INODE; /* inode-based key */ ++ key->shared.i_seq = get_inode_sequence_number(inode); ++ key->shared.pgoff = page_to_pgoff(tail); ++ rcu_read_unlock(); ++ } ++ ++out: ++ put_page(page); ++ return err; ++} ++ ++/** ++ * fault_in_user_writeable() - Fault in user address and verify RW access ++ * @uaddr: pointer to faulting user space address ++ * ++ * Slow path to fixup the fault we just took in the atomic write ++ * access to @uaddr. ++ * ++ * We have no generic implementation of a non-destructive write to the ++ * user address. We know that we faulted in the atomic pagefault ++ * disabled section so we can as well avoid the #PF overhead by ++ * calling get_user_pages() right away. ++ */ ++int fault_in_user_writeable(u32 __user *uaddr) ++{ ++ struct mm_struct *mm = current->mm; ++ int ret; ++ ++ mmap_read_lock(mm); ++ ret = fixup_user_fault(mm, (unsigned long)uaddr, ++ FAULT_FLAG_WRITE, NULL); ++ mmap_read_unlock(mm); ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/** ++ * futex_top_waiter() - Return the highest priority waiter on a futex ++ * @hb: the hash bucket the futex_q's reside in ++ * @key: the futex key (to distinguish it from other futex futex_q's) ++ * ++ * Must be called with the hb lock held. ++ */ ++struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) ++{ ++ struct futex_q *this; ++ ++ plist_for_each_entry(this, &hb->chain, list) { ++ if (futex_match(&this->key, key)) ++ return this; ++ } ++ return NULL; ++} ++ ++int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++int futex_get_value_locked(u32 *dest, u32 __user *from) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = __get_user(*dest, from); ++ pagefault_enable(); ++ ++ return ret ? -EFAULT : 0; ++} ++ ++/** ++ * wait_for_owner_exiting - Block until the owner has exited ++ * @ret: owner's current futex lock status ++ * @exiting: Pointer to the exiting task ++ * ++ * Caller must hold a refcount on @exiting. ++ */ ++void wait_for_owner_exiting(int ret, struct task_struct *exiting) ++{ ++ if (ret != -EBUSY) { ++ WARN_ON_ONCE(exiting); ++ return; ++ } ++ ++ if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) ++ return; ++ ++ mutex_lock(&exiting->futex_exit_mutex); ++ /* ++ * No point in doing state checking here. If the waiter got here ++ * while the task was in exec()->exec_futex_release() then it can ++ * have any FUTEX_STATE_* value when the waiter has acquired the ++ * mutex. OK, if running, EXITING or DEAD if it reached exit() ++ * already. Highly unlikely and not a problem. Just one more round ++ * through the futex maze. ++ */ ++ mutex_unlock(&exiting->futex_exit_mutex); ++ ++ put_task_struct(exiting); ++} ++ ++/** ++ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket ++ * @q: The futex_q to unqueue ++ * ++ * The q->lock_ptr must not be NULL and must be held by the caller. 
++ */ ++void __futex_unqueue(struct futex_q *q) ++{ ++ struct futex_hash_bucket *hb; ++ ++ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) ++ return; ++ lockdep_assert_held(q->lock_ptr); ++ ++ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); ++ plist_del(&q->list, &hb->chain); ++ futex_hb_waiters_dec(hb); ++} ++ ++/* The key must be already stored in q->key. */ ++struct futex_hash_bucket *futex_q_lock(struct futex_q *q) ++ __acquires(&hb->lock) ++{ ++ struct futex_hash_bucket *hb; ++ ++ hb = futex_hash(&q->key); ++ ++ /* ++ * Increment the counter before taking the lock so that ++ * a potential waker won't miss a to-be-slept task that is ++ * waiting for the spinlock. This is safe as all futex_q_lock() ++ * users end up calling futex_queue(). Similarly, for housekeeping, ++ * decrement the counter at futex_q_unlock() when some error has ++ * occurred and we don't end up adding the task to the list. ++ */ ++ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ ++ ++ q->lock_ptr = &hb->lock; ++ ++ spin_lock(&hb->lock); ++ return hb; ++} ++ ++void futex_q_unlock(struct futex_hash_bucket *hb) ++ __releases(&hb->lock) ++{ ++ spin_unlock(&hb->lock); ++ futex_hb_waiters_dec(hb); ++} ++ ++void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) ++{ ++ int prio; ++ ++ /* ++ * The priority used to register this element is ++ * - either the real thread-priority for the real-time threads ++ * (i.e. threads with a priority lower than MAX_RT_PRIO) ++ * - or MAX_RT_PRIO for non-RT threads. ++ * Thus, all RT-threads are woken first in priority order, and ++ * the others are woken last, in FIFO order. ++ */ ++ prio = min(current->normal_prio, MAX_RT_PRIO); ++ ++ plist_node_init(&q->list, prio); ++ plist_add(&q->list, &hb->chain); ++ q->task = current; ++} ++ ++/** ++ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket ++ * @q: The futex_q to unqueue ++ * ++ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must ++ * be paired with exactly one earlier call to futex_queue(). ++ * ++ * Return: ++ * - 1 - if the futex_q was still queued (and we removed unqueued it); ++ * - 0 - if the futex_q was already removed by the waking thread ++ */ ++int futex_unqueue(struct futex_q *q) ++{ ++ spinlock_t *lock_ptr; ++ int ret = 0; ++ ++ /* In the common case we don't take the spinlock, which is nice. */ ++retry: ++ /* ++ * q->lock_ptr can change between this read and the following spin_lock. ++ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and ++ * optimizing lock_ptr out of the logic below. ++ */ ++ lock_ptr = READ_ONCE(q->lock_ptr); ++ if (lock_ptr != NULL) { ++ spin_lock(lock_ptr); ++ /* ++ * q->lock_ptr can change between reading it and ++ * spin_lock(), causing us to take the wrong lock. This ++ * corrects the race condition. ++ * ++ * Reasoning goes like this: if we have the wrong lock, ++ * q->lock_ptr must have changed (maybe several times) ++ * between reading it and the spin_lock(). It can ++ * change again after the spin_lock() but only if it was ++ * already changed before the spin_lock(). It cannot, ++ * however, change back to the original value. Therefore ++ * we can detect whether we acquired the correct lock. 
++ */ ++ if (unlikely(lock_ptr != q->lock_ptr)) { ++ spin_unlock(lock_ptr); ++ goto retry; ++ } ++ __futex_unqueue(q); ++ ++ BUG_ON(q->pi_state); ++ ++ spin_unlock(lock_ptr); ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* ++ * PI futexes can not be requeued and must remove themselves from the ++ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. ++ */ ++void futex_unqueue_pi(struct futex_q *q) ++{ ++ __futex_unqueue(q); ++ ++ BUG_ON(!q->pi_state); ++ put_pi_state(q->pi_state); ++ q->pi_state = NULL; ++} ++ ++/* Constants for the pending_op argument of handle_futex_death */ ++#define HANDLE_DEATH_PENDING true ++#define HANDLE_DEATH_LIST false ++ ++/* ++ * Process a futex-list entry, check whether it's owned by the ++ * dying task, and do notification if so: ++ */ ++static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, ++ bool pi, bool pending_op) ++{ ++ u32 uval, nval, mval; ++ int err; ++ ++ /* Futex address must be 32bit aligned */ ++ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) ++ return -1; ++ ++retry: ++ if (get_user(uval, uaddr)) ++ return -1; ++ ++ /* ++ * Special case for regular (non PI) futexes. The unlock path in ++ * user space has two race scenarios: ++ * ++ * 1. The unlock path releases the user space futex value and ++ * before it can execute the futex() syscall to wake up ++ * waiters it is killed. ++ * ++ * 2. A woken up waiter is killed before it can acquire the ++ * futex in user space. ++ * ++ * In both cases the TID validation below prevents a wakeup of ++ * potential waiters which can cause these waiters to block ++ * forever. ++ * ++ * In both cases the following conditions are met: ++ * ++ * 1) task->robust_list->list_op_pending != NULL ++ * @pending_op == true ++ * 2) User space futex value == 0 ++ * 3) Regular futex: @pi == false ++ * ++ * If these conditions are met, it is safe to attempt waking up a ++ * potential waiter without touching the user space futex value and ++ * trying to set the OWNER_DIED bit. The user space futex value is ++ * uncontended and the rest of the user space mutex state is ++ * consistent, so a woken waiter will just take over the ++ * uncontended futex. Setting the OWNER_DIED bit would create ++ * inconsistent state and malfunction of the user space owner died ++ * handling. ++ */ ++ if (pending_op && !pi && !uval) { ++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); ++ return 0; ++ } ++ ++ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) ++ return 0; ++ ++ /* ++ * Ok, this dying thread is truly holding a futex ++ * of interest. Set the OWNER_DIED bit atomically ++ * via cmpxchg, and if the value had FUTEX_WAITERS ++ * set, wake up a waiter (if any). (We have to do a ++ * futex_wake() even if OWNER_DIED is already set - ++ * to handle the rare but possible case of recursive ++ * thread-death.) The rest of the cleanup is done in ++ * userspace. ++ */ ++ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; ++ ++ /* ++ * We are not holding a lock here, but we want to have ++ * the pagefault_disable/enable() protection because ++ * we want to handle the fault gracefully. If the ++ * access fails we try to fault in the futex with R/W ++ * verification via get_user_pages. get_user() above ++ * does not guarantee R/W access. If that fails we ++ * give up and leave the futex locked. 
++ */ ++ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { ++ switch (err) { ++ case -EFAULT: ++ if (fault_in_user_writeable(uaddr)) ++ return -1; ++ goto retry; ++ ++ case -EAGAIN: ++ cond_resched(); ++ goto retry; ++ ++ default: ++ WARN_ON_ONCE(1); ++ return err; ++ } ++ } ++ ++ if (nval != uval) ++ goto retry; ++ ++ /* ++ * Wake robust non-PI futexes here. The wakeup of ++ * PI futexes happens in exit_pi_state(): ++ */ ++ if (!pi && (uval & FUTEX_WAITERS)) ++ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); ++ ++ return 0; ++} ++ ++/* ++ * Fetch a robust-list pointer. Bit 0 signals PI futexes: ++ */ ++static inline int fetch_robust_entry(struct robust_list __user **entry, ++ struct robust_list __user * __user *head, ++ unsigned int *pi) ++{ ++ unsigned long uentry; ++ ++ if (get_user(uentry, (unsigned long __user *)head)) ++ return -EFAULT; ++ ++ *entry = (void __user *)(uentry & ~1UL); ++ *pi = uentry & 1; ++ ++ return 0; ++} ++ ++/* ++ * Walk curr->robust_list (very carefully, it's a userspace list!) ++ * and mark any locks found there dead, and notify any waiters. ++ * ++ * We silently return on any sign of list-walking problem. ++ */ ++static void exit_robust_list(struct task_struct *curr) ++{ ++ struct robust_list_head __user *head = curr->robust_list; ++ struct robust_list __user *entry, *next_entry, *pending; ++ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; ++ unsigned int next_pi; ++ unsigned long futex_offset; ++ int rc; ++ ++ if (!futex_cmpxchg_enabled) ++ return; ++ ++ /* ++ * Fetch the list head (which was registered earlier, via ++ * sys_set_robust_list()): ++ */ ++ if (fetch_robust_entry(&entry, &head->list.next, &pi)) ++ return; ++ /* ++ * Fetch the relative futex offset: ++ */ ++ if (get_user(futex_offset, &head->futex_offset)) ++ return; ++ /* ++ * Fetch any possibly pending lock-add first, and handle it ++ * if it exists: ++ */ ++ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) ++ return; ++ ++ next_entry = NULL; /* avoid warning with gcc */ ++ while (entry != &head->list) { ++ /* ++ * Fetch the next entry in the list before calling ++ * handle_futex_death: ++ */ ++ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); ++ /* ++ * A pending lock might already be on the list, so ++ * don't process it twice: ++ */ ++ if (entry != pending) { ++ if (handle_futex_death((void __user *)entry + futex_offset, ++ curr, pi, HANDLE_DEATH_LIST)) ++ return; ++ } ++ if (rc) ++ return; ++ entry = next_entry; ++ pi = next_pi; ++ /* ++ * Avoid excessively long or circular lists: ++ */ ++ if (!--limit) ++ break; ++ ++ cond_resched(); ++ } ++ ++ if (pending) { ++ handle_futex_death((void __user *)pending + futex_offset, ++ curr, pip, HANDLE_DEATH_PENDING); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++static void __user *futex_uaddr(struct robust_list __user *entry, ++ compat_long_t futex_offset) ++{ ++ compat_uptr_t base = ptr_to_compat(entry); ++ void __user *uaddr = compat_ptr(base + futex_offset); ++ ++ return uaddr; ++} ++ ++/* ++ * Fetch a robust-list pointer. Bit 0 signals PI futexes: ++ */ ++static inline int ++compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, ++ compat_uptr_t __user *head, unsigned int *pi) ++{ ++ if (get_user(*uentry, head)) ++ return -EFAULT; ++ ++ *entry = compat_ptr((*uentry) & ~1); ++ *pi = (unsigned int)(*uentry) & 1; ++ ++ return 0; ++} ++ ++/* ++ * Walk curr->robust_list (very carefully, it's a userspace list!) ++ * and mark any locks found there dead, and notify any waiters. 
++ * ++ * We silently return on any sign of list-walking problem. ++ */ ++static void compat_exit_robust_list(struct task_struct *curr) ++{ ++ struct compat_robust_list_head __user *head = curr->compat_robust_list; ++ struct robust_list __user *entry, *next_entry, *pending; ++ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; ++ unsigned int next_pi; ++ compat_uptr_t uentry, next_uentry, upending; ++ compat_long_t futex_offset; ++ int rc; ++ ++ if (!futex_cmpxchg_enabled) ++ return; ++ ++ /* ++ * Fetch the list head (which was registered earlier, via ++ * sys_set_robust_list()): ++ */ ++ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) ++ return; ++ /* ++ * Fetch the relative futex offset: ++ */ ++ if (get_user(futex_offset, &head->futex_offset)) ++ return; ++ /* ++ * Fetch any possibly pending lock-add first, and handle it ++ * if it exists: ++ */ ++ if (compat_fetch_robust_entry(&upending, &pending, ++ &head->list_op_pending, &pip)) ++ return; ++ ++ next_entry = NULL; /* avoid warning with gcc */ ++ while (entry != (struct robust_list __user *) &head->list) { ++ /* ++ * Fetch the next entry in the list before calling ++ * handle_futex_death: ++ */ ++ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, ++ (compat_uptr_t __user *)&entry->next, &next_pi); ++ /* ++ * A pending lock might already be on the list, so ++ * dont process it twice: ++ */ ++ if (entry != pending) { ++ void __user *uaddr = futex_uaddr(entry, futex_offset); ++ ++ if (handle_futex_death(uaddr, curr, pi, ++ HANDLE_DEATH_LIST)) ++ return; ++ } ++ if (rc) ++ return; ++ uentry = next_uentry; ++ entry = next_entry; ++ pi = next_pi; ++ /* ++ * Avoid excessively long or circular lists: ++ */ ++ if (!--limit) ++ break; ++ ++ cond_resched(); ++ } ++ if (pending) { ++ void __user *uaddr = futex_uaddr(pending, futex_offset); ++ ++ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); ++ } ++} ++#endif ++ ++#ifdef CONFIG_FUTEX_PI ++ ++/* ++ * This task is holding PI mutexes at exit time => bad. ++ * Kernel cleans up PI-state, but userspace is likely hosed. ++ * (Robust-futex cleanup is separate and might save the day for userspace.) ++ */ ++static void exit_pi_state_list(struct task_struct *curr) ++{ ++ struct list_head *next, *head = &curr->pi_state_list; ++ struct futex_pi_state *pi_state; ++ struct futex_hash_bucket *hb; ++ union futex_key key = FUTEX_KEY_INIT; ++ ++ if (!futex_cmpxchg_enabled) ++ return; ++ /* ++ * We are a ZOMBIE and nobody can enqueue itself on ++ * pi_state_list anymore, but we have to be careful ++ * versus waiters unqueueing themselves: ++ */ ++ raw_spin_lock_irq(&curr->pi_lock); ++ while (!list_empty(head)) { ++ next = head->next; ++ pi_state = list_entry(next, struct futex_pi_state, list); ++ key = pi_state->key; ++ hb = futex_hash(&key); ++ ++ /* ++ * We can race against put_pi_state() removing itself from the ++ * list (a waiter going away). put_pi_state() will first ++ * decrement the reference count and then modify the list, so ++ * its possible to see the list entry but fail this reference ++ * acquire. ++ * ++ * In that case; drop the locks to let put_pi_state() make ++ * progress and retry the loop. 
++ */ ++ if (!refcount_inc_not_zero(&pi_state->refcount)) { ++ raw_spin_unlock_irq(&curr->pi_lock); ++ cpu_relax(); ++ raw_spin_lock_irq(&curr->pi_lock); ++ continue; ++ } ++ raw_spin_unlock_irq(&curr->pi_lock); ++ ++ spin_lock(&hb->lock); ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ raw_spin_lock(&curr->pi_lock); ++ /* ++ * We dropped the pi-lock, so re-check whether this ++ * task still owns the PI-state: ++ */ ++ if (head->next != next) { ++ /* retain curr->pi_lock for the loop invariant */ ++ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(&hb->lock); ++ put_pi_state(pi_state); ++ continue; ++ } ++ ++ WARN_ON(pi_state->owner != curr); ++ WARN_ON(list_empty(&pi_state->list)); ++ list_del_init(&pi_state->list); ++ pi_state->owner = NULL; ++ ++ raw_spin_unlock(&curr->pi_lock); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(&hb->lock); ++ ++ rt_mutex_futex_unlock(&pi_state->pi_mutex); ++ put_pi_state(pi_state); ++ ++ raw_spin_lock_irq(&curr->pi_lock); ++ } ++ raw_spin_unlock_irq(&curr->pi_lock); ++} ++#else ++static inline void exit_pi_state_list(struct task_struct *curr) { } ++#endif ++ ++static void futex_cleanup(struct task_struct *tsk) ++{ ++ if (unlikely(tsk->robust_list)) { ++ exit_robust_list(tsk); ++ tsk->robust_list = NULL; ++ } ++ ++#ifdef CONFIG_COMPAT ++ if (unlikely(tsk->compat_robust_list)) { ++ compat_exit_robust_list(tsk); ++ tsk->compat_robust_list = NULL; ++ } ++#endif ++ ++ if (unlikely(!list_empty(&tsk->pi_state_list))) ++ exit_pi_state_list(tsk); ++} ++ ++/** ++ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD ++ * @tsk: task to set the state on ++ * ++ * Set the futex exit state of the task lockless. The futex waiter code ++ * observes that state when a task is exiting and loops until the task has ++ * actually finished the futex cleanup. The worst case for this is that the ++ * waiter runs through the wait loop until the state becomes visible. ++ * ++ * This is called from the recursive fault handling path in do_exit(). ++ * ++ * This is best effort. Either the futex exit code has run already or ++ * not. If the OWNER_DIED bit has been set on the futex then the waiter can ++ * take it over. If not, the problem is pushed back to user space. If the ++ * futex exit code did not run yet, then an already queued waiter might ++ * block forever, but there is nothing which can be done about that. ++ */ ++void futex_exit_recursive(struct task_struct *tsk) ++{ ++ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ ++ if (tsk->futex_state == FUTEX_STATE_EXITING) ++ mutex_unlock(&tsk->futex_exit_mutex); ++ tsk->futex_state = FUTEX_STATE_DEAD; ++} ++ ++static void futex_cleanup_begin(struct task_struct *tsk) ++{ ++ /* ++ * Prevent various race issues against a concurrent incoming waiter ++ * including live locks by forcing the waiter to block on ++ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in ++ * attach_to_pi_owner(). ++ */ ++ mutex_lock(&tsk->futex_exit_mutex); ++ ++ /* ++ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. ++ * ++ * This ensures that all subsequent checks of tsk->futex_state in ++ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with ++ * tsk->pi_lock held. ++ * ++ * It guarantees also that a pi_state which was queued right before ++ * the state change under tsk->pi_lock by a concurrent waiter must ++ * be observed in exit_pi_state_list(). 
++ */ ++ raw_spin_lock_irq(&tsk->pi_lock); ++ tsk->futex_state = FUTEX_STATE_EXITING; ++ raw_spin_unlock_irq(&tsk->pi_lock); ++} ++ ++static void futex_cleanup_end(struct task_struct *tsk, int state) ++{ ++ /* ++ * Lockless store. The only side effect is that an observer might ++ * take another loop until it becomes visible. ++ */ ++ tsk->futex_state = state; ++ /* ++ * Drop the exit protection. This unblocks waiters which observed ++ * FUTEX_STATE_EXITING to reevaluate the state. ++ */ ++ mutex_unlock(&tsk->futex_exit_mutex); ++} ++ ++void futex_exec_release(struct task_struct *tsk) ++{ ++ /* ++ * The state handling is done for consistency, but in the case of ++ * exec() there is no way to prevent further damage as the PID stays ++ * the same. But for the unlikely and arguably buggy case that a ++ * futex is held on exec(), this provides at least as much state ++ * consistency protection which is possible. ++ */ ++ futex_cleanup_begin(tsk); ++ futex_cleanup(tsk); ++ /* ++ * Reset the state to FUTEX_STATE_OK. The task is alive and about ++ * exec a new binary. ++ */ ++ futex_cleanup_end(tsk, FUTEX_STATE_OK); ++} ++ ++void futex_exit_release(struct task_struct *tsk) ++{ ++ futex_cleanup_begin(tsk); ++ futex_cleanup(tsk); ++ futex_cleanup_end(tsk, FUTEX_STATE_DEAD); ++} ++ ++static void __init futex_detect_cmpxchg(void) ++{ ++#ifndef CONFIG_HAVE_FUTEX_CMPXCHG ++ u32 curval; ++ ++ /* ++ * This will fail and we want it. Some arch implementations do ++ * runtime detection of the futex_atomic_cmpxchg_inatomic() ++ * functionality. We want to know that before we call in any ++ * of the complex code paths. Also we want to prevent ++ * registration of robust lists in that case. NULL is ++ * guaranteed to fault and we get -EFAULT on functional ++ * implementation, the non-functional ones will return ++ * -ENOSYS. ++ */ ++ if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) ++ futex_cmpxchg_enabled = 1; ++#endif ++} ++ ++static int __init futex_init(void) ++{ ++ unsigned int futex_shift; ++ unsigned long i; ++ ++#if CONFIG_BASE_SMALL ++ futex_hashsize = 16; ++#else ++ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); ++#endif ++ ++ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), ++ futex_hashsize, 0, ++ futex_hashsize < 256 ? HASH_SMALL : 0, ++ &futex_shift, NULL, ++ futex_hashsize, futex_hashsize); ++ futex_hashsize = 1UL << futex_shift; ++ ++ futex_detect_cmpxchg(); ++ ++ for (i = 0; i < futex_hashsize; i++) { ++ atomic_set(&futex_queues[i].waiters, 0); ++ plist_head_init(&futex_queues[i].chain); ++ spin_lock_init(&futex_queues[i].lock); ++ } ++ ++ return 0; ++} ++core_initcall(futex_init); +diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h +new file mode 100644 +index 000000000..948fcf317 +--- /dev/null ++++ b/kernel/futex/futex.h +@@ -0,0 +1,295 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _FUTEX_H ++#define _FUTEX_H ++ ++#include ++#include ++ ++#include ++ ++/* ++ * Futex flags used to encode options to functions and preserve them across ++ * restarts. ++ */ ++#ifdef CONFIG_MMU ++# define FLAGS_SHARED 0x01 ++#else ++/* ++ * NOMMU does not have per process address space. Let the compiler optimize ++ * code away. 
++ */ ++# define FLAGS_SHARED 0x00 ++#endif ++#define FLAGS_CLOCKRT 0x02 ++#define FLAGS_HAS_TIMEOUT 0x04 ++ ++#ifdef CONFIG_HAVE_FUTEX_CMPXCHG ++#define futex_cmpxchg_enabled 1 ++#else ++extern int __read_mostly futex_cmpxchg_enabled; ++#endif ++ ++#ifdef CONFIG_FAIL_FUTEX ++extern bool should_fail_futex(bool fshared); ++#else ++static inline bool should_fail_futex(bool fshared) ++{ ++ return false; ++} ++#endif ++ ++/* ++ * Hash buckets are shared by all the futex_keys that hash to the same ++ * location. Each key may have multiple futex_q structures, one for each task ++ * waiting on a futex. ++ */ ++struct futex_hash_bucket { ++ atomic_t waiters; ++ spinlock_t lock; ++ struct plist_head chain; ++} ____cacheline_aligned_in_smp; ++ ++/* ++ * Priority Inheritance state: ++ */ ++struct futex_pi_state { ++ /* ++ * list of 'owned' pi_state instances - these have to be ++ * cleaned up in do_exit() if the task exits prematurely: ++ */ ++ struct list_head list; ++ ++ /* ++ * The PI object: ++ */ ++ struct rt_mutex_base pi_mutex; ++ ++ struct task_struct *owner; ++ refcount_t refcount; ++ ++ union futex_key key; ++} __randomize_layout; ++ ++/** ++ * struct futex_q - The hashed futex queue entry, one per waiting task ++ * @list: priority-sorted list of tasks waiting on this futex ++ * @task: the task waiting on the futex ++ * @lock_ptr: the hash bucket lock ++ * @key: the key the futex is hashed on ++ * @pi_state: optional priority inheritance state ++ * @rt_waiter: rt_waiter storage for use with requeue_pi ++ * @requeue_pi_key: the requeue_pi target futex key ++ * @bitset: bitset for the optional bitmasked wakeup ++ * @requeue_state: State field for futex_requeue_pi() ++ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) ++ * ++ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so ++ * we can wake only the relevant ones (hashed queues may be shared). ++ * ++ * A futex_q has a woken state, just like tasks have TASK_RUNNING. ++ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. ++ * The order of wakeup is always to make the first condition true, then ++ * the second. ++ * ++ * PI futexes are typically woken before they are removed from the hash list via ++ * the rt_mutex code. See futex_unqueue_pi(). ++ */ ++struct futex_q { ++ struct plist_node list; ++ ++ struct task_struct *task; ++ spinlock_t *lock_ptr; ++ union futex_key key; ++ struct futex_pi_state *pi_state; ++ struct rt_mutex_waiter *rt_waiter; ++ union futex_key *requeue_pi_key; ++ u32 bitset; ++ atomic_t requeue_state; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcuwait requeue_wait; ++#endif ++} __randomize_layout; ++ ++extern const struct futex_q futex_q_init; ++ ++enum futex_access { ++ FUTEX_READ, ++ FUTEX_WRITE ++}; ++ ++extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, ++ enum futex_access rw); ++ ++extern struct hrtimer_sleeper * ++futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, ++ int flags, u64 range_ns); ++ ++extern struct futex_hash_bucket *futex_hash(union futex_key *key); ++ ++/** ++ * futex_match - Check whether two futex keys are equal ++ * @key1: Pointer to key1 ++ * @key2: Pointer to key2 ++ * ++ * Return 1 if two futex_keys are equal, 0 otherwise. 
++ */ ++static inline int futex_match(union futex_key *key1, union futex_key *key2) ++{ ++ return (key1 && key2 ++ && key1->both.word == key2->both.word ++ && key1->both.ptr == key2->both.ptr ++ && key1->both.offset == key2->both.offset); ++} ++ ++extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb); ++extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, ++ struct hrtimer_sleeper *timeout); ++extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); ++ ++extern int fault_in_user_writeable(u32 __user *uaddr); ++extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); ++extern int futex_get_value_locked(u32 *dest, u32 __user *from); ++extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); ++ ++extern void __futex_unqueue(struct futex_q *q); ++extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); ++extern int futex_unqueue(struct futex_q *q); ++ ++/** ++ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket ++ * @q: The futex_q to enqueue ++ * @hb: The destination hash bucket ++ * ++ * The hb->lock must be held by the caller, and is released here. A call to ++ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The ++ * exceptions involve the PI related operations, which may use futex_unqueue_pi() ++ * or nothing if the unqueue is done as part of the wake process and the unqueue ++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for ++ * an example). ++ */ ++static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) ++ __releases(&hb->lock) ++{ ++ __futex_queue(q, hb); ++ spin_unlock(&hb->lock); ++} ++ ++extern void futex_unqueue_pi(struct futex_q *q); ++ ++extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); ++ ++/* ++ * Reflects a new waiter being added to the waitqueue. ++ */ ++static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ atomic_inc(&hb->waiters); ++ /* ++ * Full barrier (A), see the ordering comment above. ++ */ ++ smp_mb__after_atomic(); ++#endif ++} ++ ++/* ++ * Reflects a waiter being removed from the waitqueue by wakeup ++ * paths. ++ */ ++static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ atomic_dec(&hb->waiters); ++#endif ++} ++ ++static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * Full barrier (B), see the ordering comment above. 
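/*
 * Illustrative sketch (not from this patch): a C11 model of the barrier
 * pairing (A)/(B) referenced above, with seq_cst atomics standing in for
 * smp_mb().  The property the full barriers provide is that a waker and a
 * concurrent waiter can never both miss each other: either the waker sees
 * waiters != 0, or the waiter re-reads the updated futex word and does not
 * sleep.  The names below are made up for the model.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int waiters;	/* stands in for hb->waiters     */
static atomic_int futex_word;	/* stands in for the user futex  */

static bool waiter_should_sleep(int expected)
{
	atomic_fetch_add(&waiters, 1);			/* pairs with barrier (A) */
	return atomic_load(&futex_word) == expected;	/* value unchanged: sleep */
}

static bool waker_must_look_at_queue(int new_value)
{
	atomic_store(&futex_word, new_value);		/* user space updates the futex word */
	return atomic_load(&waiters) != 0;		/* pairs with barrier (B) */
}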
++ */ ++ smp_mb(); ++ return atomic_read(&hb->waiters); ++#else ++ return 1; ++#endif ++} ++ ++extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); ++extern void futex_q_unlock(struct futex_hash_bucket *hb); ++ ++ ++extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ++ union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct *task, ++ struct task_struct **exiting, ++ int set_waiters); ++ ++extern int refill_pi_state_cache(void); ++extern void get_pi_state(struct futex_pi_state *pi_state); ++extern void put_pi_state(struct futex_pi_state *pi_state); ++extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); ++ ++/* ++ * Express the locking dependencies for lockdep: ++ */ ++static inline void ++double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ++{ ++ if (hb1 > hb2) ++ swap(hb1, hb2); ++ ++ spin_lock(&hb1->lock); ++ if (hb1 != hb2) ++ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); ++} ++ ++static inline void ++double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) ++{ ++ spin_unlock(&hb1->lock); ++ if (hb1 != hb2) ++ spin_unlock(&hb2->lock); ++} ++ ++/* syscalls */ ++ ++extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 ++ val, ktime_t *abs_time, u32 bitset, u32 __user ++ *uaddr2); ++ ++extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, ++ u32 __user *uaddr2, int nr_wake, int nr_requeue, ++ u32 *cmpval, int requeue_pi); ++ ++extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ++ ktime_t *abs_time, u32 bitset); ++ ++/** ++ * struct futex_vector - Auxiliary struct for futex_waitv() ++ * @w: Userspace provided data ++ * @q: Kernel side data ++ * ++ * Struct used to build an array with all data need for futex_waitv() ++ */ ++struct futex_vector { ++ struct futex_waitv w; ++ struct futex_q q; ++}; ++ ++extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to); ++ ++extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); ++ ++extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, ++ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); ++ ++extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); ++ ++extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); ++ ++#endif /* _FUTEX_H */ +diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c +new file mode 100644 +index 000000000..183b28c32 +--- /dev/null ++++ b/kernel/futex/pi.c +@@ -0,0 +1,1233 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++#include ++#include ++ ++#include "futex.h" ++#include "../locking/rtmutex_common.h" ++ ++/* ++ * PI code: ++ */ ++int refill_pi_state_cache(void) ++{ ++ struct futex_pi_state *pi_state; ++ ++ if (likely(current->pi_state_cache)) ++ return 0; ++ ++ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); ++ ++ if (!pi_state) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&pi_state->list); ++ /* pi_mutex gets initialized later */ ++ pi_state->owner = NULL; ++ refcount_set(&pi_state->refcount, 1); ++ pi_state->key = FUTEX_KEY_INIT; ++ ++ current->pi_state_cache = pi_state; ++ ++ return 0; ++} ++ ++static struct futex_pi_state *alloc_pi_state(void) ++{ ++ struct futex_pi_state *pi_state = current->pi_state_cache; ++ ++ WARN_ON(!pi_state); ++ current->pi_state_cache = NULL; ++ ++ return pi_state; ++} ++ ++static void pi_state_update_owner(struct futex_pi_state *pi_state, ++ struct task_struct 
*new_owner) ++{ ++ struct task_struct *old_owner = pi_state->owner; ++ ++ lockdep_assert_held(&pi_state->pi_mutex.wait_lock); ++ ++ if (old_owner) { ++ raw_spin_lock(&old_owner->pi_lock); ++ WARN_ON(list_empty(&pi_state->list)); ++ list_del_init(&pi_state->list); ++ raw_spin_unlock(&old_owner->pi_lock); ++ } ++ ++ if (new_owner) { ++ raw_spin_lock(&new_owner->pi_lock); ++ WARN_ON(!list_empty(&pi_state->list)); ++ list_add(&pi_state->list, &new_owner->pi_state_list); ++ pi_state->owner = new_owner; ++ raw_spin_unlock(&new_owner->pi_lock); ++ } ++} ++ ++void get_pi_state(struct futex_pi_state *pi_state) ++{ ++ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); ++} ++ ++/* ++ * Drops a reference to the pi_state object and frees or caches it ++ * when the last reference is gone. ++ */ ++void put_pi_state(struct futex_pi_state *pi_state) ++{ ++ if (!pi_state) ++ return; ++ ++ if (!refcount_dec_and_test(&pi_state->refcount)) ++ return; ++ ++ /* ++ * If pi_state->owner is NULL, the owner is most probably dying ++ * and has cleaned up the pi_state already ++ */ ++ if (pi_state->owner) { ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); ++ pi_state_update_owner(pi_state, NULL); ++ rt_mutex_proxy_unlock(&pi_state->pi_mutex); ++ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); ++ } ++ ++ if (current->pi_state_cache) { ++ kfree(pi_state); ++ } else { ++ /* ++ * pi_state->list is already empty. ++ * clear pi_state->owner. ++ * refcount is at 0 - put it back to 1. ++ */ ++ pi_state->owner = NULL; ++ refcount_set(&pi_state->refcount, 1); ++ current->pi_state_cache = pi_state; ++ } ++} ++ ++/* ++ * We need to check the following states: ++ * ++ * Waiter | pi_state | pi->owner | uTID | uODIED | ? ++ * ++ * [1] NULL | --- | --- | 0 | 0/1 | Valid ++ * [2] NULL | --- | --- | >0 | 0/1 | Valid ++ * ++ * [3] Found | NULL | -- | Any | 0/1 | Invalid ++ * ++ * [4] Found | Found | NULL | 0 | 1 | Valid ++ * [5] Found | Found | NULL | >0 | 1 | Invalid ++ * ++ * [6] Found | Found | task | 0 | 1 | Valid ++ * ++ * [7] Found | Found | NULL | Any | 0 | Invalid ++ * ++ * [8] Found | Found | task | ==taskTID | 0/1 | Valid ++ * [9] Found | Found | task | 0 | 0 | Invalid ++ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid ++ * ++ * [1] Indicates that the kernel can acquire the futex atomically. We ++ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. ++ * ++ * [2] Valid, if TID does not belong to a kernel thread. If no matching ++ * thread is found then it indicates that the owner TID has died. ++ * ++ * [3] Invalid. The waiter is queued on a non PI futex ++ * ++ * [4] Valid state after exit_robust_list(), which sets the user space ++ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. ++ * ++ * [5] The user space value got manipulated between exit_robust_list() ++ * and exit_pi_state_list() ++ * ++ * [6] Valid state after exit_pi_state_list() which sets the new owner in ++ * the pi_state but cannot access the user space value. ++ * ++ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. ++ * ++ * [8] Owner and user space value match ++ * ++ * [9] There is no transient state which sets the user space TID to 0 ++ * except exit_robust_list(), but this is indicated by the ++ * FUTEX_OWNER_DIED bit. See [4] ++ * ++ * [10] There is no transient state which leaves owner and user space ++ * TID out of sync. Except one error case where the kernel is denied ++ * write access to the user address, see fixup_pi_state_owner(). 
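/*
 * Illustrative sketch (not from this patch): the user-space half of the
 * PI futex word protocol that the state table above reasons about.  The
 * low bits of the word carry the owner TID; FUTEX_WAITERS and
 * FUTEX_OWNER_DIED are managed by the kernel.  The wrapper names
 * pi_lock()/pi_unlock() are made up; the syscall usage follows futex(2).
 */
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void pi_lock(uint32_t *futex)
{
	uint32_t zero = 0, tid = syscall(SYS_gettid);

	/* Fast path: 0 -> TID makes us the owner without entering the kernel. */
	if (__atomic_compare_exchange_n(futex, &zero, tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	/* Contended: the kernel sets FUTEX_WAITERS and boosts the owner. */
	sys_futex(futex, FUTEX_LOCK_PI, 0);
}

static void pi_unlock(uint32_t *futex)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: TID -> 0 only succeeds while no waiter/died bits are set. */
	if (__atomic_compare_exchange_n(futex, &tid, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	/* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the kernel hand over. */
	sys_futex(futex, FUTEX_UNLOCK_PI, 0);
}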
++ * ++ * ++ * Serialization and lifetime rules: ++ * ++ * hb->lock: ++ * ++ * hb -> futex_q, relation ++ * futex_q -> pi_state, relation ++ * ++ * (cannot be raw because hb can contain arbitrary amount ++ * of futex_q's) ++ * ++ * pi_mutex->wait_lock: ++ * ++ * {uval, pi_state} ++ * ++ * (and pi_mutex 'obviously') ++ * ++ * p->pi_lock: ++ * ++ * p->pi_state_list -> pi_state->list, relation ++ * pi_mutex->owner -> pi_state->owner, relation ++ * ++ * pi_state->refcount: ++ * ++ * pi_state lifetime ++ * ++ * ++ * Lock order: ++ * ++ * hb->lock ++ * pi_mutex->wait_lock ++ * p->pi_lock ++ * ++ */ ++ ++/* ++ * Validate that the existing waiter has a pi_state and sanity check ++ * the pi_state against the user space value. If correct, attach to ++ * it. ++ */ ++static int attach_to_pi_state(u32 __user *uaddr, u32 uval, ++ struct futex_pi_state *pi_state, ++ struct futex_pi_state **ps) ++{ ++ pid_t pid = uval & FUTEX_TID_MASK; ++ u32 uval2; ++ int ret; ++ ++ /* ++ * Userspace might have messed up non-PI and PI futexes [3] ++ */ ++ if (unlikely(!pi_state)) ++ return -EINVAL; ++ ++ /* ++ * We get here with hb->lock held, and having found a ++ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q ++ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(), ++ * which in turn means that futex_lock_pi() still has a reference on ++ * our pi_state. ++ * ++ * The waiter holding a reference on @pi_state also protects against ++ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() ++ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently ++ * free pi_state before we can take a reference ourselves. ++ */ ++ WARN_ON(!refcount_read(&pi_state->refcount)); ++ ++ /* ++ * Now that we have a pi_state, we can acquire wait_lock ++ * and do the state validation. ++ */ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ /* ++ * Since {uval, pi_state} is serialized by wait_lock, and our current ++ * uval was read without holding it, it can have changed. Verify it ++ * still is what we expect it to be, otherwise retry the entire ++ * operation. ++ */ ++ if (futex_get_value_locked(&uval2, uaddr)) ++ goto out_efault; ++ ++ if (uval != uval2) ++ goto out_eagain; ++ ++ /* ++ * Handle the owner died case: ++ */ ++ if (uval & FUTEX_OWNER_DIED) { ++ /* ++ * exit_pi_state_list sets owner to NULL and wakes the ++ * topmost waiter. The task which acquires the ++ * pi_state->rt_mutex will fixup owner. ++ */ ++ if (!pi_state->owner) { ++ /* ++ * No pi state owner, but the user space TID ++ * is not 0. Inconsistent state. [5] ++ */ ++ if (pid) ++ goto out_einval; ++ /* ++ * Take a ref on the state and return success. [4] ++ */ ++ goto out_attach; ++ } ++ ++ /* ++ * If TID is 0, then either the dying owner has not ++ * yet executed exit_pi_state_list() or some waiter ++ * acquired the rtmutex in the pi state, but did not ++ * yet fixup the TID in user space. ++ * ++ * Take a ref on the state and return success. [6] ++ */ ++ if (!pid) ++ goto out_attach; ++ } else { ++ /* ++ * If the owner died bit is not set, then the pi_state ++ * must have an owner. [7] ++ */ ++ if (!pi_state->owner) ++ goto out_einval; ++ } ++ ++ /* ++ * Bail out if user space manipulated the futex value. If pi ++ * state exists then the owner TID must be the same as the ++ * user space TID. 
[9/10] ++ */ ++ if (pid != task_pid_vnr(pi_state->owner)) ++ goto out_einval; ++ ++out_attach: ++ get_pi_state(pi_state); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ *ps = pi_state; ++ return 0; ++ ++out_einval: ++ ret = -EINVAL; ++ goto out_error; ++ ++out_eagain: ++ ret = -EAGAIN; ++ goto out_error; ++ ++out_efault: ++ ret = -EFAULT; ++ goto out_error; ++ ++out_error: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ return ret; ++} ++ ++static int handle_exit_race(u32 __user *uaddr, u32 uval, ++ struct task_struct *tsk) ++{ ++ u32 uval2; ++ ++ /* ++ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the ++ * caller that the alleged owner is busy. ++ */ ++ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) ++ return -EBUSY; ++ ++ /* ++ * Reread the user space value to handle the following situation: ++ * ++ * CPU0 CPU1 ++ * ++ * sys_exit() sys_futex() ++ * do_exit() futex_lock_pi() ++ * futex_lock_pi_atomic() ++ * exit_signals(tsk) No waiters: ++ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID ++ * mm_release(tsk) Set waiter bit ++ * exit_robust_list(tsk) { *uaddr = 0x80000PID; ++ * Set owner died attach_to_pi_owner() { ++ * *uaddr = 0xC0000000; tsk = get_task(PID); ++ * } if (!tsk->flags & PF_EXITING) { ++ * ... attach(); ++ * tsk->futex_state = } else { ++ * FUTEX_STATE_DEAD; if (tsk->futex_state != ++ * FUTEX_STATE_DEAD) ++ * return -EAGAIN; ++ * return -ESRCH; <--- FAIL ++ * } ++ * ++ * Returning ESRCH unconditionally is wrong here because the ++ * user space value has been changed by the exiting task. ++ * ++ * The same logic applies to the case where the exiting task is ++ * already gone. ++ */ ++ if (futex_get_value_locked(&uval2, uaddr)) ++ return -EFAULT; ++ ++ /* If the user space value has changed, try again. */ ++ if (uval2 != uval) ++ return -EAGAIN; ++ ++ /* ++ * The exiting task did not have a robust list, the robust list was ++ * corrupted or the user space value in *uaddr is simply bogus. ++ * Give up and tell user space. ++ */ ++ return -ESRCH; ++} ++ ++static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, ++ struct futex_pi_state **ps) ++{ ++ /* ++ * No existing pi state. First waiter. [2] ++ * ++ * This creates pi_state, we have hb->lock held, this means nothing can ++ * observe this state, wait_lock is irrelevant. ++ */ ++ struct futex_pi_state *pi_state = alloc_pi_state(); ++ ++ /* ++ * Initialize the pi_mutex in locked state and make @p ++ * the owner of it: ++ */ ++ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); ++ ++ /* Store the key for possible exit cleanups: */ ++ pi_state->key = *key; ++ ++ WARN_ON(!list_empty(&pi_state->list)); ++ list_add(&pi_state->list, &p->pi_state_list); ++ /* ++ * Assignment without holding pi_state->pi_mutex.wait_lock is safe ++ * because there is no concurrency as the object is not published yet. ++ */ ++ pi_state->owner = p; ++ ++ *ps = pi_state; ++} ++/* ++ * Lookup the task for the TID provided from user space and attach to ++ * it after doing proper sanity checks. ++ */ ++static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct **exiting) ++{ ++ pid_t pid = uval & FUTEX_TID_MASK; ++ struct task_struct *p; ++ ++ /* ++ * We are the first waiter - try to look up the real owner and attach ++ * the new pi_state to it, but bail out when TID = 0 [1] ++ * ++ * The !pid check is paranoid. None of the call sites should end up ++ * with pid == 0, but better safe than sorry. 
Let the caller retry ++ */ ++ if (!pid) ++ return -EAGAIN; ++ p = find_get_task_by_vpid(pid); ++ if (!p) ++ return handle_exit_race(uaddr, uval, NULL); ++ ++ if (unlikely(p->flags & PF_KTHREAD)) { ++ put_task_struct(p); ++ return -EPERM; ++ } ++ ++ /* ++ * We need to look at the task state to figure out, whether the ++ * task is exiting. To protect against the change of the task state ++ * in futex_exit_release(), we do this protected by p->pi_lock: ++ */ ++ raw_spin_lock_irq(&p->pi_lock); ++ if (unlikely(p->futex_state != FUTEX_STATE_OK)) { ++ /* ++ * The task is on the way out. When the futex state is ++ * FUTEX_STATE_DEAD, we know that the task has finished ++ * the cleanup: ++ */ ++ int ret = handle_exit_race(uaddr, uval, p); ++ ++ raw_spin_unlock_irq(&p->pi_lock); ++ /* ++ * If the owner task is between FUTEX_STATE_EXITING and ++ * FUTEX_STATE_DEAD then store the task pointer and keep ++ * the reference on the task struct. The calling code will ++ * drop all locks, wait for the task to reach ++ * FUTEX_STATE_DEAD and then drop the refcount. This is ++ * required to prevent a live lock when the current task ++ * preempted the exiting task between the two states. ++ */ ++ if (ret == -EBUSY) ++ *exiting = p; ++ else ++ put_task_struct(p); ++ return ret; ++ } ++ ++ __attach_to_pi_owner(p, key, ps); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ ++ return 0; ++} ++ ++static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) ++{ ++ int err; ++ u32 curval; ++ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); ++ if (unlikely(err)) ++ return err; ++ ++ /* If user space value changed, let the caller retry */ ++ return curval != uval ? -EAGAIN : 0; ++} ++ ++/** ++ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex ++ * @uaddr: the pi futex user address ++ * @hb: the pi futex hash bucket ++ * @key: the futex key associated with uaddr and hb ++ * @ps: the pi_state pointer where we store the result of the ++ * lookup ++ * @task: the task to perform the atomic lock work for. This will ++ * be "current" except in the case of requeue pi. ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Return: ++ * - 0 - ready to wait; ++ * - 1 - acquired the lock; ++ * - <0 - error ++ * ++ * The hb->lock must be held by the caller. ++ * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. ++ */ ++int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, ++ union futex_key *key, ++ struct futex_pi_state **ps, ++ struct task_struct *task, ++ struct task_struct **exiting, ++ int set_waiters) ++{ ++ u32 uval, newval, vpid = task_pid_vnr(task); ++ struct futex_q *top_waiter; ++ int ret; ++ ++ /* ++ * Read the user space value first so we can validate a few ++ * things before proceeding further. ++ */ ++ if (futex_get_value_locked(&uval, uaddr)) ++ return -EFAULT; ++ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ /* ++ * Detect deadlocks. ++ */ ++ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) ++ return -EDEADLK; ++ ++ if ((unlikely(should_fail_futex(true)))) ++ return -EDEADLK; ++ ++ /* ++ * Lookup existing state first. 
If it exists, try to attach to ++ * its pi_state. ++ */ ++ top_waiter = futex_top_waiter(hb, key); ++ if (top_waiter) ++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); ++ ++ /* ++ * No waiter and user TID is 0. We are here because the ++ * waiters or the owner died bit is set or called from ++ * requeue_cmp_pi or for whatever reason something took the ++ * syscall. ++ */ ++ if (!(uval & FUTEX_TID_MASK)) { ++ /* ++ * We take over the futex. No other waiters and the user space ++ * TID is 0. We preserve the owner died bit. ++ */ ++ newval = uval & FUTEX_OWNER_DIED; ++ newval |= vpid; ++ ++ /* The futex requeue_pi code can enforce the waiters bit */ ++ if (set_waiters) ++ newval |= FUTEX_WAITERS; ++ ++ ret = lock_pi_update_atomic(uaddr, uval, newval); ++ if (ret) ++ return ret; ++ ++ /* ++ * If the waiter bit was requested the caller also needs PI ++ * state attached to the new owner of the user space futex. ++ * ++ * @task is guaranteed to be alive and it cannot be exiting ++ * because it is either sleeping or waiting in ++ * futex_requeue_pi_wakeup_sync(). ++ * ++ * No need to do the full attach_to_pi_owner() exercise ++ * because @task is known and valid. ++ */ ++ if (set_waiters) { ++ raw_spin_lock_irq(&task->pi_lock); ++ __attach_to_pi_owner(task, key, ps); ++ raw_spin_unlock_irq(&task->pi_lock); ++ } ++ return 1; ++ } ++ ++ /* ++ * First waiter. Set the waiters bit before attaching ourself to ++ * the owner. If owner tries to unlock, it will be forced into ++ * the kernel and blocked on hb->lock. ++ */ ++ newval = uval | FUTEX_WAITERS; ++ ret = lock_pi_update_atomic(uaddr, uval, newval); ++ if (ret) ++ return ret; ++ /* ++ * If the update of the user space value succeeded, we try to ++ * attach to the owner. If that fails, no harm done, we only ++ * set the FUTEX_WAITERS bit in the user space variable. ++ */ ++ return attach_to_pi_owner(uaddr, newval, key, ps, exiting); ++} ++ ++/* ++ * Caller must hold a reference on @pi_state. ++ */ ++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) ++{ ++ struct rt_mutex_waiter *top_waiter; ++ struct task_struct *new_owner; ++ bool postunlock = false; ++ DEFINE_RT_WAKE_Q(wqh); ++ u32 curval, newval; ++ int ret = 0; ++ ++ top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); ++ if (WARN_ON_ONCE(!top_waiter)) { ++ /* ++ * As per the comment in futex_unlock_pi() this should not happen. ++ * ++ * When this happens, give up our locks and try again, giving ++ * the futex_lock_pi() instance time to complete, either by ++ * waiting on the rtmutex or removing itself from the futex ++ * queue. ++ */ ++ ret = -EAGAIN; ++ goto out_unlock; ++ } ++ ++ new_owner = top_waiter->task; ++ ++ /* ++ * We pass it to the next owner. The WAITERS bit is always kept ++ * enabled while there is PI state around. We cleanup the owner ++ * died bit, because we are the owner. ++ */ ++ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); ++ ++ if (unlikely(should_fail_futex(true))) { ++ ret = -EFAULT; ++ goto out_unlock; ++ } ++ ++ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); ++ if (!ret && (curval != uval)) { ++ /* ++ * If a unconditional UNLOCK_PI operation (user space did not ++ * try the TID->0 transition) raced with a waiter setting the ++ * FUTEX_WAITERS flag between get_user() and locking the hash ++ * bucket lock, retry the operation. 
++ */ ++ if ((FUTEX_TID_MASK & curval) == uval) ++ ret = -EAGAIN; ++ else ++ ret = -EINVAL; ++ } ++ ++ if (!ret) { ++ /* ++ * This is a point of no return; once we modified the uval ++ * there is no going back and subsequent operations must ++ * not fail. ++ */ ++ pi_state_update_owner(pi_state, new_owner); ++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); ++ } ++ ++out_unlock: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ if (postunlock) ++ rt_mutex_postunlock(&wqh); ++ ++ return ret; ++} ++ ++static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ++ struct task_struct *argowner) ++{ ++ struct futex_pi_state *pi_state = q->pi_state; ++ struct task_struct *oldowner, *newowner; ++ u32 uval, curval, newval, newtid; ++ int err = 0; ++ ++ oldowner = pi_state->owner; ++ ++ /* ++ * We are here because either: ++ * ++ * - we stole the lock and pi_state->owner needs updating to reflect ++ * that (@argowner == current), ++ * ++ * or: ++ * ++ * - someone stole our lock and we need to fix things to point to the ++ * new owner (@argowner == NULL). ++ * ++ * Either way, we have to replace the TID in the user space variable. ++ * This must be atomic as we have to preserve the owner died bit here. ++ * ++ * Note: We write the user space value _before_ changing the pi_state ++ * because we can fault here. Imagine swapped out pages or a fork ++ * that marked all the anonymous memory readonly for cow. ++ * ++ * Modifying pi_state _before_ the user space value would leave the ++ * pi_state in an inconsistent state when we fault here, because we ++ * need to drop the locks to handle the fault. This might be observed ++ * in the PID checks when attaching to PI state . ++ */ ++retry: ++ if (!argowner) { ++ if (oldowner != current) { ++ /* ++ * We raced against a concurrent self; things are ++ * already fixed up. Nothing to do. ++ */ ++ return 0; ++ } ++ ++ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { ++ /* We got the lock. pi_state is correct. Tell caller. */ ++ return 1; ++ } ++ ++ /* ++ * The trylock just failed, so either there is an owner or ++ * there is a higher priority waiter than this one. ++ */ ++ newowner = rt_mutex_owner(&pi_state->pi_mutex); ++ /* ++ * If the higher priority waiter has not yet taken over the ++ * rtmutex then newowner is NULL. We can't return here with ++ * that state because it's inconsistent vs. the user space ++ * state. So drop the locks and try again. It's a valid ++ * situation and not any different from the other retry ++ * conditions. ++ */ ++ if (unlikely(!newowner)) { ++ err = -EAGAIN; ++ goto handle_err; ++ } ++ } else { ++ WARN_ON_ONCE(argowner != current); ++ if (oldowner == current) { ++ /* ++ * We raced against a concurrent self; things are ++ * already fixed up. Nothing to do. ++ */ ++ return 1; ++ } ++ newowner = argowner; ++ } ++ ++ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; ++ /* Owner died? */ ++ if (!pi_state->owner) ++ newtid |= FUTEX_OWNER_DIED; ++ ++ err = futex_get_value_locked(&uval, uaddr); ++ if (err) ++ goto handle_err; ++ ++ for (;;) { ++ newval = (uval & FUTEX_OWNER_DIED) | newtid; ++ ++ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); ++ if (err) ++ goto handle_err; ++ ++ if (curval == uval) ++ break; ++ uval = curval; ++ } ++ ++ /* ++ * We fixed up user space. Now we need to fix the pi_state ++ * itself. 
++ */ ++ pi_state_update_owner(pi_state, newowner); ++ ++ return argowner == current; ++ ++ /* ++ * In order to reschedule or handle a page fault, we need to drop the ++ * locks here. In the case of a fault, this gives the other task ++ * (either the highest priority waiter itself or the task which stole ++ * the rtmutex) the chance to try the fixup of the pi_state. So once we ++ * are back from handling the fault we need to check the pi_state after ++ * reacquiring the locks and before trying to do another fixup. When ++ * the fixup has been done already we simply return. ++ * ++ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely ++ * drop hb->lock since the caller owns the hb -> futex_q relation. ++ * Dropping the pi_mutex->wait_lock requires the state revalidate. ++ */ ++handle_err: ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(q->lock_ptr); ++ ++ switch (err) { ++ case -EFAULT: ++ err = fault_in_user_writeable(uaddr); ++ break; ++ ++ case -EAGAIN: ++ cond_resched(); ++ err = 0; ++ break; ++ ++ default: ++ WARN_ON_ONCE(1); ++ break; ++ } ++ ++ spin_lock(q->lock_ptr); ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ++ /* ++ * Check if someone else fixed it for us: ++ */ ++ if (pi_state->owner != oldowner) ++ return argowner == current; ++ ++ /* Retry if err was -EAGAIN or the fault in succeeded */ ++ if (!err) ++ goto retry; ++ ++ /* ++ * fault_in_user_writeable() failed so user state is immutable. At ++ * best we can make the kernel state consistent but user state will ++ * be most likely hosed and any subsequent unlock operation will be ++ * rejected due to PI futex rule [10]. ++ * ++ * Ensure that the rtmutex owner is also the pi_state owner despite ++ * the user space value claiming something different. There is no ++ * point in unlocking the rtmutex if current is the owner as it ++ * would need to wait until the next waiter has taken the rtmutex ++ * to guarantee consistent state. Keep it simple. Userspace asked ++ * for this wreckaged state. ++ * ++ * The rtmutex has an owner - either current or some other ++ * task. See the EAGAIN loop above. ++ */ ++ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); ++ ++ return err; ++} ++ ++static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, ++ struct task_struct *argowner) ++{ ++ struct futex_pi_state *pi_state = q->pi_state; ++ int ret; ++ ++ lockdep_assert_held(q->lock_ptr); ++ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ ret = __fixup_pi_state_owner(uaddr, q, argowner); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); ++ return ret; ++} ++ ++/** ++ * fixup_pi_owner() - Post lock pi_state and corner case management ++ * @uaddr: user address of the futex ++ * @q: futex_q (contains pi_state and access to the rt_mutex) ++ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) ++ * ++ * After attempting to lock an rt_mutex, this function is called to cleanup ++ * the pi_state owner as well as handle race conditions that may allow us to ++ * acquire the lock. Must be called with the hb lock held. ++ * ++ * Return: ++ * - 1 - success, lock taken; ++ * - 0 - success, lock not taken; ++ * - <0 - on error (-EFAULT) ++ */ ++int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) ++{ ++ if (locked) { ++ /* ++ * Got the lock. 
We might not be the anticipated owner if we ++ * did a lock-steal - fix up the PI-state in that case: ++ * ++ * Speculative pi_state->owner read (we don't hold wait_lock); ++ * since we own the lock pi_state->owner == current is the ++ * stable state, anything else needs more attention. ++ */ ++ if (q->pi_state->owner != current) ++ return fixup_pi_state_owner(uaddr, q, current); ++ return 1; ++ } ++ ++ /* ++ * If we didn't get the lock; check if anybody stole it from us. In ++ * that case, we need to fix up the uval to point to them instead of ++ * us, otherwise bad things happen. [10] ++ * ++ * Another speculative read; pi_state->owner == current is unstable ++ * but needs our attention. ++ */ ++ if (q->pi_state->owner == current) ++ return fixup_pi_state_owner(uaddr, q, NULL); ++ ++ /* ++ * Paranoia check. If we did not take the lock, then we should not be ++ * the owner of the rt_mutex. Warn and establish consistent state. ++ */ ++ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) ++ return fixup_pi_state_owner(uaddr, q, current); ++ ++ return 0; ++} ++ ++/* ++ * Userspace tried a 0 -> TID atomic transition of the futex value ++ * and failed. The kernel side here does the whole locking operation: ++ * if there are waiters then it will block as a consequence of relying ++ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see ++ * a 0 value of the futex too.). ++ * ++ * Also serves as futex trylock_pi()'ing, and due semantics. ++ */ ++int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct task_struct *exiting = NULL; ++ struct rt_mutex_waiter rt_waiter; ++ struct futex_hash_bucket *hb; ++ struct futex_q q = futex_q_init; ++ int res, ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++ if (refill_pi_state_cache()) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(time, &timeout, flags, 0); ++ ++retry: ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ goto out; ++ ++retry_private: ++ hb = futex_q_lock(&q); ++ ++ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, ++ &exiting, 0); ++ if (unlikely(ret)) { ++ /* ++ * Atomic work succeeded and we got the lock, ++ * or failed. Either way, we do _not_ block. ++ */ ++ switch (ret) { ++ case 1: ++ /* We got the lock. */ ++ ret = 0; ++ goto out_unlock_put_key; ++ case -EFAULT: ++ goto uaddr_faulted; ++ case -EBUSY: ++ case -EAGAIN: ++ /* ++ * Two reasons for this: ++ * - EBUSY: Task is exiting and we just wait for the ++ * exit to complete. ++ * - EAGAIN: The user space value changed. ++ */ ++ futex_q_unlock(hb); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. ++ */ ++ wait_for_owner_exiting(ret, exiting); ++ cond_resched(); ++ goto retry; ++ default: ++ goto out_unlock_put_key; ++ } ++ } ++ ++ WARN_ON(!q.pi_state); ++ ++ /* ++ * Only actually queue now that the atomic ops are done: ++ */ ++ __futex_queue(&q, hb); ++ ++ if (trylock) { ++ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); ++ /* Fixup the trylock return value: */ ++ ret = ret ? 
0 : -EWOULDBLOCK; ++ goto no_block; ++ } ++ ++ rt_mutex_init_waiter(&rt_waiter); ++ ++ /* ++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not ++ * hold it while doing rt_mutex_start_proxy(), because then it will ++ * include hb->lock in the blocking chain, even through we'll not in ++ * fact hold it while blocking. This will lead it to report -EDEADLK ++ * and BUG when futex_unlock_pi() interleaves with this. ++ * ++ * Therefore acquire wait_lock while holding hb->lock, but drop the ++ * latter before calling __rt_mutex_start_proxy_lock(). This ++ * interleaves with futex_unlock_pi() -- which does a similar lock ++ * handoff -- such that the latter can observe the futex_q::pi_state ++ * before __rt_mutex_start_proxy_lock() is done. ++ */ ++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); ++ spin_unlock(q.lock_ptr); ++ /* ++ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter ++ * such that futex_unlock_pi() is guaranteed to observe the waiter when ++ * it sees the futex_q::pi_state. ++ */ ++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); ++ ++ if (ret) { ++ if (ret == 1) ++ ret = 0; ++ goto cleanup; ++ } ++ ++ if (unlikely(to)) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); ++ ++cleanup: ++ spin_lock(q.lock_ptr); ++ /* ++ * If we failed to acquire the lock (deadlock/signal/timeout), we must ++ * first acquire the hb->lock before removing the lock from the ++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait ++ * lists consistent. ++ * ++ * In particular; it is important that futex_unlock_pi() can not ++ * observe this inconsistency. ++ */ ++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ++ ret = 0; ++ ++no_block: ++ /* ++ * Fixup the pi_state owner and possibly acquire the lock if we ++ * haven't already. ++ */ ++ res = fixup_pi_owner(uaddr, &q, !ret); ++ /* ++ * If fixup_pi_owner() returned an error, propagate that. If it acquired ++ * the lock, clear our -ETIMEDOUT or -EINTR. ++ */ ++ if (res) ++ ret = (res < 0) ? res : 0; ++ ++ futex_unqueue_pi(&q); ++ spin_unlock(q.lock_ptr); ++ goto out; ++ ++out_unlock_put_key: ++ futex_q_unlock(hb); ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret != -EINTR ? ret : -ERESTARTNOINTR; ++ ++uaddr_faulted: ++ futex_q_unlock(hb); ++ ++ ret = fault_in_user_writeable(uaddr); ++ if (ret) ++ goto out; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++} ++ ++/* ++ * Userspace attempted a TID -> 0 atomic transition, and failed. ++ * This is the in-kernel slowpath: we look up the PI state (if any), ++ * and do the rt-mutex unlock. ++ */ ++int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) ++{ ++ u32 curval, uval, vpid = task_pid_vnr(current); ++ union futex_key key = FUTEX_KEY_INIT; ++ struct futex_hash_bucket *hb; ++ struct futex_q *top_waiter; ++ int ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++retry: ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ /* ++ * We release only a lock we actually own: ++ */ ++ if ((uval & FUTEX_TID_MASK) != vpid) ++ return -EPERM; ++ ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); ++ if (ret) ++ return ret; ++ ++ hb = futex_hash(&key); ++ spin_lock(&hb->lock); ++ ++ /* ++ * Check waiters first. 
We do not trust user space values at ++ * all and we at least want to know if user space fiddled ++ * with the futex value instead of blindly unlocking. ++ */ ++ top_waiter = futex_top_waiter(hb, &key); ++ if (top_waiter) { ++ struct futex_pi_state *pi_state = top_waiter->pi_state; ++ ++ ret = -EINVAL; ++ if (!pi_state) ++ goto out_unlock; ++ ++ /* ++ * If current does not own the pi_state then the futex is ++ * inconsistent and user space fiddled with the futex value. ++ */ ++ if (pi_state->owner != current) ++ goto out_unlock; ++ ++ get_pi_state(pi_state); ++ /* ++ * By taking wait_lock while still holding hb->lock, we ensure ++ * there is no point where we hold neither; and therefore ++ * wake_futex_p() must observe a state consistent with what we ++ * observed. ++ * ++ * In particular; this forces __rt_mutex_start_proxy() to ++ * complete such that we're guaranteed to observe the ++ * rt_waiter. Also see the WARN in wake_futex_pi(). ++ */ ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ spin_unlock(&hb->lock); ++ ++ /* drops pi_state->pi_mutex.wait_lock */ ++ ret = wake_futex_pi(uaddr, uval, pi_state); ++ ++ put_pi_state(pi_state); ++ ++ /* ++ * Success, we're done! No tricky corner cases. ++ */ ++ if (!ret) ++ return ret; ++ /* ++ * The atomic access to the futex value generated a ++ * pagefault, so retry the user-access and the wakeup: ++ */ ++ if (ret == -EFAULT) ++ goto pi_faulted; ++ /* ++ * A unconditional UNLOCK_PI op raced against a waiter ++ * setting the FUTEX_WAITERS bit. Try again. ++ */ ++ if (ret == -EAGAIN) ++ goto pi_retry; ++ /* ++ * wake_futex_pi has detected invalid state. Tell user ++ * space. ++ */ ++ return ret; ++ } ++ ++ /* ++ * We have no kernel internal state, i.e. no waiters in the ++ * kernel. Waiters which are about to queue themselves are stuck ++ * on hb->lock. So we can safely ignore them. We do neither ++ * preserve the WAITERS bit not the OWNER_DIED one. We are the ++ * owner. ++ */ ++ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) { ++ spin_unlock(&hb->lock); ++ switch (ret) { ++ case -EFAULT: ++ goto pi_faulted; ++ ++ case -EAGAIN: ++ goto pi_retry; ++ ++ default: ++ WARN_ON_ONCE(1); ++ return ret; ++ } ++ } ++ ++ /* ++ * If uval has changed, let user space handle it. ++ */ ++ ret = (curval == uval) ? 0 : -EAGAIN; ++ ++out_unlock: ++ spin_unlock(&hb->lock); ++ return ret; ++ ++pi_retry: ++ cond_resched(); ++ goto retry; ++ ++pi_faulted: ++ ++ ret = fault_in_user_writeable(uaddr); ++ if (!ret) ++ goto retry; ++ ++ return ret; ++} ++ +diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c +new file mode 100644 +index 000000000..cba8b1a6a +--- /dev/null ++++ b/kernel/futex/requeue.c +@@ -0,0 +1,897 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++#include ++ ++#include "futex.h" ++#include "../locking/rtmutex_common.h" ++ ++/* ++ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an ++ * underlying rtmutex. The task which is about to be requeued could have ++ * just woken up (timeout, signal). After the wake up the task has to ++ * acquire hash bucket lock, which is held by the requeue code. As a task ++ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking ++ * and the hash bucket lock blocking would collide and corrupt state. 
++ * ++ * On !PREEMPT_RT this is not a problem and everything could be serialized ++ * on hash bucket lock, but aside of having the benefit of common code, ++ * this allows to avoid doing the requeue when the task is already on the ++ * way out and taking the hash bucket lock of the original uaddr1 when the ++ * requeue has been completed. ++ * ++ * The following state transitions are valid: ++ * ++ * On the waiter side: ++ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE ++ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT ++ * ++ * On the requeue side: ++ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS ++ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED ++ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) ++ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED ++ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) ++ * ++ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this ++ * signals that the waiter is already on the way out. It also means that ++ * the waiter is still on the 'wait' futex, i.e. uaddr1. ++ * ++ * The waiter side signals early wakeup to the requeue side either through ++ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending ++ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately ++ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, ++ * which means the wakeup is interleaving with a requeue in progress it has ++ * to wait for the requeue side to change the state. Either to DONE/LOCKED ++ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex ++ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by ++ * the requeue side when the requeue attempt failed via deadlock detection ++ * and therefore the waiter q is still on the uaddr1 futex. ++ */ ++enum { ++ Q_REQUEUE_PI_NONE = 0, ++ Q_REQUEUE_PI_IGNORE, ++ Q_REQUEUE_PI_IN_PROGRESS, ++ Q_REQUEUE_PI_WAIT, ++ Q_REQUEUE_PI_DONE, ++ Q_REQUEUE_PI_LOCKED, ++}; ++ ++const struct futex_q futex_q_init = { ++ /* list gets initialized in futex_queue()*/ ++ .key = FUTEX_KEY_INIT, ++ .bitset = FUTEX_BITSET_MATCH_ANY, ++ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), ++}; ++ ++/** ++ * requeue_futex() - Requeue a futex_q from one hb to another ++ * @q: the futex_q to requeue ++ * @hb1: the source hash_bucket ++ * @hb2: the target hash_bucket ++ * @key2: the new key for the requeued futex_q ++ */ ++static inline ++void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, union futex_key *key2) ++{ ++ ++ /* ++ * If key1 and key2 hash to the same bucket, no need to ++ * requeue. ++ */ ++ if (likely(&hb1->chain != &hb2->chain)) { ++ plist_del(&q->list, &hb1->chain); ++ futex_hb_waiters_dec(hb1); ++ futex_hb_waiters_inc(hb2); ++ plist_add(&q->list, &hb2->chain); ++ q->lock_ptr = &hb2->lock; ++ } ++ q->key = *key2; ++} ++ ++static inline bool futex_requeue_pi_prepare(struct futex_q *q, ++ struct futex_pi_state *pi_state) ++{ ++ int old, new; ++ ++ /* ++ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has ++ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should ++ * ignore the waiter. ++ */ ++ old = atomic_read_acquire(&q->requeue_state); ++ do { ++ if (old == Q_REQUEUE_PI_IGNORE) ++ return false; ++ ++ /* ++ * futex_proxy_trylock_atomic() might have set it to ++ * IN_PROGRESS and a interleaved early wake to WAIT. 
++ * ++ * It was considered to have an extra state for that ++ * trylock, but that would just add more conditionals ++ * all over the place for a dubious value. ++ */ ++ if (old != Q_REQUEUE_PI_NONE) ++ break; ++ ++ new = Q_REQUEUE_PI_IN_PROGRESS; ++ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); ++ ++ q->pi_state = pi_state; ++ return true; ++} ++ ++static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) ++{ ++ int old, new; ++ ++ old = atomic_read_acquire(&q->requeue_state); ++ do { ++ if (old == Q_REQUEUE_PI_IGNORE) ++ return; ++ ++ if (locked >= 0) { ++ /* Requeue succeeded. Set DONE or LOCKED */ ++ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && ++ old != Q_REQUEUE_PI_WAIT); ++ new = Q_REQUEUE_PI_DONE + locked; ++ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { ++ /* Deadlock, no early wakeup interleave */ ++ new = Q_REQUEUE_PI_NONE; ++ } else { ++ /* Deadlock, early wakeup interleave. */ ++ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); ++ new = Q_REQUEUE_PI_IGNORE; ++ } ++ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); ++ ++#ifdef CONFIG_PREEMPT_RT ++ /* If the waiter interleaved with the requeue let it know */ ++ if (unlikely(old == Q_REQUEUE_PI_WAIT)) ++ rcuwait_wake_up(&q->requeue_wait); ++#endif ++} ++ ++static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) ++{ ++ int old, new; ++ ++ old = atomic_read_acquire(&q->requeue_state); ++ do { ++ /* Is requeue done already? */ ++ if (old >= Q_REQUEUE_PI_DONE) ++ return old; ++ ++ /* ++ * If not done, then tell the requeue code to either ignore ++ * the waiter or to wake it up once the requeue is done. ++ */ ++ new = Q_REQUEUE_PI_WAIT; ++ if (old == Q_REQUEUE_PI_NONE) ++ new = Q_REQUEUE_PI_IGNORE; ++ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); ++ ++ /* If the requeue was in progress, wait for it to complete */ ++ if (old == Q_REQUEUE_PI_IN_PROGRESS) { ++#ifdef CONFIG_PREEMPT_RT ++ rcuwait_wait_event(&q->requeue_wait, ++ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, ++ TASK_UNINTERRUPTIBLE); ++#else ++ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); ++#endif ++ } ++ ++ /* ++ * Requeue is now either prohibited or complete. Reread state ++ * because during the wait above it might have changed. Nothing ++ * will modify q->requeue_state after this point. ++ */ ++ return atomic_read(&q->requeue_state); ++} ++ ++/** ++ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue ++ * @q: the futex_q ++ * @key: the key of the requeue target futex ++ * @hb: the hash_bucket of the requeue target futex ++ * ++ * During futex_requeue, with requeue_pi=1, it is possible to acquire the ++ * target futex if it is uncontended or via a lock steal. ++ * ++ * 1) Set @q::key to the requeue target futex key so the waiter can detect ++ * the wakeup on the right futex. ++ * ++ * 2) Dequeue @q from the hash bucket. ++ * ++ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock ++ * acquisition. ++ * ++ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that ++ * the waiter has to fixup the pi state. ++ * ++ * 5) Complete the requeue state so the waiter can make progress. After ++ * this point the waiter task can return from the syscall immediately in ++ * case that the pi state does not have to be fixed up. ++ * ++ * 6) Wake the waiter task. ++ * ++ * Must be called with both q->lock_ptr and hb->lock held. 
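/*
 * Illustrative sketch (not from this patch): the user-space call pattern
 * that drives the requeue machinery above, as a PI condition variable
 * implementation (e.g. glibc's) would issue it.  Argument layout follows
 * futex(2); cond_word/lock_word and the wrapper names are made up.  Note
 * that futex_requeue() later in this file rejects nr_wake != 1 for the
 * PI case.
 */
#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Waiter side: block on cond_word and let the kernel requeue us onto the
 * PI futex lock_word once it is signalled. */
static long cond_wait_requeue_pi(uint32_t *cond_word, uint32_t seen,
				 uint32_t *lock_word)
{
	return syscall(SYS_futex, cond_word,
		       FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		       seen, NULL /* no timeout */, lock_word, 0);
}

/* Broadcast side: wake at most one waiter (only if lock_word is
 * uncontended) and requeue the rest onto lock_word, so the PI unlock path
 * wakes them one at a time. */
static long cond_broadcast_requeue_pi(uint32_t *cond_word, uint32_t expected,
				      uint32_t *lock_word)
{
	return syscall(SYS_futex, cond_word,
		       FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
		       1 /* nr_wake */, (unsigned long)INT_MAX /* nr_requeue */,
		       lock_word, expected /* expected *cond_word */);
}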
++ */ ++static inline ++void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, ++ struct futex_hash_bucket *hb) ++{ ++ q->key = *key; ++ ++ __futex_unqueue(q); ++ ++ WARN_ON(!q->rt_waiter); ++ q->rt_waiter = NULL; ++ ++ q->lock_ptr = &hb->lock; ++ ++ /* Signal locked state to the waiter */ ++ futex_requeue_pi_complete(q, 1); ++ wake_up_state(q->task, TASK_NORMAL); ++} ++ ++/** ++ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter ++ * @pifutex: the user address of the to futex ++ * @hb1: the from futex hash bucket, must be locked by the caller ++ * @hb2: the to futex hash bucket, must be locked by the caller ++ * @key1: the from futex key ++ * @key2: the to futex key ++ * @ps: address to store the pi_state pointer ++ * @exiting: Pointer to store the task pointer of the owner task ++ * which is in the middle of exiting ++ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) ++ * ++ * Try and get the lock on behalf of the top waiter if we can do it atomically. ++ * Wake the top waiter if we succeed. If the caller specified set_waiters, ++ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. ++ * hb1 and hb2 must be held by the caller. ++ * ++ * @exiting is only set when the return value is -EBUSY. If so, this holds ++ * a refcount on the exiting task on return and the caller needs to drop it ++ * after waiting for the exit to complete. ++ * ++ * Return: ++ * - 0 - failed to acquire the lock atomically; ++ * - >0 - acquired the lock, return value is vpid of the top_waiter ++ * - <0 - error ++ */ ++static int ++futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, ++ struct futex_hash_bucket *hb2, union futex_key *key1, ++ union futex_key *key2, struct futex_pi_state **ps, ++ struct task_struct **exiting, int set_waiters) ++{ ++ struct futex_q *top_waiter = NULL; ++ u32 curval; ++ int ret; ++ ++ if (futex_get_value_locked(&curval, pifutex)) ++ return -EFAULT; ++ ++ if (unlikely(should_fail_futex(true))) ++ return -EFAULT; ++ ++ /* ++ * Find the top_waiter and determine if there are additional waiters. ++ * If the caller intends to requeue more than 1 waiter to pifutex, ++ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, ++ * as we have means to handle the possible fault. If not, don't set ++ * the bit unnecessarily as it will force the subsequent unlock to enter ++ * the kernel. ++ */ ++ top_waiter = futex_top_waiter(hb1, key1); ++ ++ /* There are no waiters, nothing for us to do. */ ++ if (!top_waiter) ++ return 0; ++ ++ /* ++ * Ensure that this is a waiter sitting in futex_wait_requeue_pi() ++ * and waiting on the 'waitqueue' futex which is always !PI. ++ */ ++ if (!top_waiter->rt_waiter || top_waiter->pi_state) ++ return -EINVAL; ++ ++ /* Ensure we requeue to the expected futex. */ ++ if (!futex_match(top_waiter->requeue_pi_key, key2)) ++ return -EINVAL; ++ ++ /* Ensure that this does not race against an early wakeup */ ++ if (!futex_requeue_pi_prepare(top_waiter, NULL)) ++ return -EAGAIN; ++ ++ /* ++ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit ++ * in the contended case or if @set_waiters is true. ++ * ++ * In the contended case PI state is attached to the lock owner. If ++ * the user space lock can be acquired then PI state is attached to ++ * the new owner (@top_waiter->task) when @set_waiters is true. 
++ */ ++ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, ++ exiting, set_waiters); ++ if (ret == 1) { ++ /* ++ * Lock was acquired in user space and PI state was ++ * attached to @top_waiter->task. That means state is fully ++ * consistent and the waiter can return to user space ++ * immediately after the wakeup. ++ */ ++ requeue_pi_wake_futex(top_waiter, key2, hb2); ++ } else if (ret < 0) { ++ /* Rewind top_waiter::requeue_state */ ++ futex_requeue_pi_complete(top_waiter, ret); ++ } else { ++ /* ++ * futex_lock_pi_atomic() did not acquire the user space ++ * futex, but managed to establish the proxy lock and pi ++ * state. top_waiter::requeue_state cannot be fixed up here ++ * because the waiter is not enqueued on the rtmutex ++ * yet. This is handled at the callsite depending on the ++ * result of rt_mutex_start_proxy_lock() which is ++ * guaranteed to be reached with this function returning 0. ++ */ ++ } ++ return ret; ++} ++ ++/** ++ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 ++ * @uaddr1: source futex user address ++ * @flags: futex flags (FLAGS_SHARED, etc.) ++ * @uaddr2: target futex user address ++ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) ++ * @nr_requeue: number of waiters to requeue (0-INT_MAX) ++ * @cmpval: @uaddr1 expected value (or %NULL) ++ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a ++ * pi futex (pi to pi requeue is not supported) ++ * ++ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire ++ * uaddr2 atomically on behalf of the top waiter. ++ * ++ * Return: ++ * - >=0 - on success, the number of tasks requeued or woken; ++ * - <0 - on error ++ */ ++int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ++ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) ++{ ++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; ++ int task_count = 0, ret; ++ struct futex_pi_state *pi_state = NULL; ++ struct futex_hash_bucket *hb1, *hb2; ++ struct futex_q *this, *next; ++ DEFINE_WAKE_Q(wake_q); ++ ++ if (nr_wake < 0 || nr_requeue < 0) ++ return -EINVAL; ++ ++ /* ++ * When PI not supported: return -ENOSYS if requeue_pi is true, ++ * consequently the compiler knows requeue_pi is always false past ++ * this point which will optimize away all the conditional code ++ * further down. ++ */ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) ++ return -ENOSYS; ++ ++ if (requeue_pi) { ++ /* ++ * Requeue PI only works on two distinct uaddrs. This ++ * check is only valid for private futexes. See below. ++ */ ++ if (uaddr1 == uaddr2) ++ return -EINVAL; ++ ++ /* ++ * futex_requeue() allows the caller to define the number ++ * of waiters to wake up via the @nr_wake argument. With ++ * REQUEUE_PI, waking up more than one waiter is creating ++ * more problems than it solves. Waking up a waiter makes ++ * only sense if the PI futex @uaddr2 is uncontended as ++ * this allows the requeue code to acquire the futex ++ * @uaddr2 before waking the waiter. The waiter can then ++ * return to user space without further action. A secondary ++ * wakeup would just make the futex_wait_requeue_pi() ++ * handling more complex, because that code would have to ++ * look up pi_state and do more or less all the handling ++ * which the requeue code has to do for the to be requeued ++ * waiters. So restrict the number of waiters to wake to ++ * one, and only wake it up when the PI futex is ++ * uncontended. 
Otherwise requeue it and let the unlock of ++ * the PI futex handle the wakeup. ++ * ++ * All REQUEUE_PI users, e.g. pthread_cond_signal() and ++ * pthread_cond_broadcast() must use nr_wake=1. ++ */ ++ if (nr_wake != 1) ++ return -EINVAL; ++ ++ /* ++ * requeue_pi requires a pi_state, try to allocate it now ++ * without any locks in case it fails. ++ */ ++ if (refill_pi_state_cache()) ++ return -ENOMEM; ++ } ++ ++retry: ++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, ++ requeue_pi ? FUTEX_WRITE : FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ /* ++ * The check above which compares uaddrs is not sufficient for ++ * shared futexes. We need to compare the keys: ++ */ ++ if (requeue_pi && futex_match(&key1, &key2)) ++ return -EINVAL; ++ ++ hb1 = futex_hash(&key1); ++ hb2 = futex_hash(&key2); ++ ++retry_private: ++ futex_hb_waiters_inc(hb2); ++ double_lock_hb(hb1, hb2); ++ ++ if (likely(cmpval != NULL)) { ++ u32 curval; ++ ++ ret = futex_get_value_locked(&curval, uaddr1); ++ ++ if (unlikely(ret)) { ++ double_unlock_hb(hb1, hb2); ++ futex_hb_waiters_dec(hb2); ++ ++ ret = get_user(curval, uaddr1); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++ } ++ if (curval != *cmpval) { ++ ret = -EAGAIN; ++ goto out_unlock; ++ } ++ } ++ ++ if (requeue_pi) { ++ struct task_struct *exiting = NULL; ++ ++ /* ++ * Attempt to acquire uaddr2 and wake the top waiter. If we ++ * intend to requeue waiters, force setting the FUTEX_WAITERS ++ * bit. We force this here where we are able to easily handle ++ * faults rather in the requeue loop below. ++ * ++ * Updates topwaiter::requeue_state if a top waiter exists. ++ */ ++ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, ++ &key2, &pi_state, ++ &exiting, nr_requeue); ++ ++ /* ++ * At this point the top_waiter has either taken uaddr2 or ++ * is waiting on it. In both cases pi_state has been ++ * established and an initial refcount on it. In case of an ++ * error there's nothing. ++ * ++ * The top waiter's requeue_state is up to date: ++ * ++ * - If the lock was acquired atomically (ret == 1), then ++ * the state is Q_REQUEUE_PI_LOCKED. ++ * ++ * The top waiter has been dequeued and woken up and can ++ * return to user space immediately. The kernel/user ++ * space state is consistent. In case that there must be ++ * more waiters requeued the WAITERS bit in the user ++ * space futex is set so the top waiter task has to go ++ * into the syscall slowpath to unlock the futex. This ++ * will block until this requeue operation has been ++ * completed and the hash bucket locks have been ++ * dropped. ++ * ++ * - If the trylock failed with an error (ret < 0) then ++ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing ++ * happened", or Q_REQUEUE_PI_IGNORE when there was an ++ * interleaved early wakeup. ++ * ++ * - If the trylock did not succeed (ret == 0) then the ++ * state is either Q_REQUEUE_PI_IN_PROGRESS or ++ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. ++ * This will be cleaned up in the loop below, which ++ * cannot fail because futex_proxy_trylock_atomic() did ++ * the same sanity checks for requeue_pi as the loop ++ * below does. ++ */ ++ switch (ret) { ++ case 0: ++ /* We hold a reference on the pi state. */ ++ break; ++ ++ case 1: ++ /* ++ * futex_proxy_trylock_atomic() acquired the user space ++ * futex. Adjust task_count. 
++ */ ++ task_count++; ++ ret = 0; ++ break; ++ ++ /* ++ * If the above failed, then pi_state is NULL and ++ * waiter::requeue_state is correct. ++ */ ++ case -EFAULT: ++ double_unlock_hb(hb1, hb2); ++ futex_hb_waiters_dec(hb2); ++ ret = fault_in_user_writeable(uaddr2); ++ if (!ret) ++ goto retry; ++ return ret; ++ case -EBUSY: ++ case -EAGAIN: ++ /* ++ * Two reasons for this: ++ * - EBUSY: Owner is exiting and we just wait for the ++ * exit to complete. ++ * - EAGAIN: The user space value changed. ++ */ ++ double_unlock_hb(hb1, hb2); ++ futex_hb_waiters_dec(hb2); ++ /* ++ * Handle the case where the owner is in the middle of ++ * exiting. Wait for the exit to complete otherwise ++ * this task might loop forever, aka. live lock. ++ */ ++ wait_for_owner_exiting(ret, exiting); ++ cond_resched(); ++ goto retry; ++ default: ++ goto out_unlock; ++ } ++ } ++ ++ plist_for_each_entry_safe(this, next, &hb1->chain, list) { ++ if (task_count - nr_wake >= nr_requeue) ++ break; ++ ++ if (!futex_match(&this->key, &key1)) ++ continue; ++ ++ /* ++ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always ++ * be paired with each other and no other futex ops. ++ * ++ * We should never be requeueing a futex_q with a pi_state, ++ * which is awaiting a futex_unlock_pi(). ++ */ ++ if ((requeue_pi && !this->rt_waiter) || ++ (!requeue_pi && this->rt_waiter) || ++ this->pi_state) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* Plain futexes just wake or requeue and are done */ ++ if (!requeue_pi) { ++ if (++task_count <= nr_wake) ++ futex_wake_mark(&wake_q, this); ++ else ++ requeue_futex(this, hb1, hb2, &key2); ++ continue; ++ } ++ ++ /* Ensure we requeue to the expected futex for requeue_pi. */ ++ if (!futex_match(this->requeue_pi_key, &key2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* ++ * Requeue nr_requeue waiters and possibly one more in the case ++ * of requeue_pi if we couldn't acquire the lock atomically. ++ * ++ * Prepare the waiter to take the rt_mutex. Take a refcount ++ * on the pi_state and store the pointer in the futex_q ++ * object of the waiter. ++ */ ++ get_pi_state(pi_state); ++ ++ /* Don't requeue when the waiter is already on the way out. */ ++ if (!futex_requeue_pi_prepare(this, pi_state)) { ++ /* ++ * Early woken waiter signaled that it is on the ++ * way out. Drop the pi_state reference and try the ++ * next waiter. @this->pi_state is still NULL. ++ */ ++ put_pi_state(pi_state); ++ continue; ++ } ++ ++ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, ++ this->rt_waiter, ++ this->task); ++ ++ if (ret == 1) { ++ /* ++ * We got the lock. We do neither drop the refcount ++ * on pi_state nor clear this->pi_state because the ++ * waiter needs the pi_state for cleaning up the ++ * user space value. It will drop the refcount ++ * after doing so. this::requeue_state is updated ++ * in the wakeup as well. ++ */ ++ requeue_pi_wake_futex(this, &key2, hb2); ++ task_count++; ++ } else if (!ret) { ++ /* Waiter is queued, move it to hb2 */ ++ requeue_futex(this, hb1, hb2, &key2); ++ futex_requeue_pi_complete(this, 0); ++ task_count++; ++ } else { ++ /* ++ * rt_mutex_start_proxy_lock() detected a potential ++ * deadlock when we tried to queue that waiter. ++ * Drop the pi_state reference which we took above ++ * and remove the pointer to the state from the ++ * waiters futex_q object. ++ */ ++ this->pi_state = NULL; ++ put_pi_state(pi_state); ++ futex_requeue_pi_complete(this, ret); ++ /* ++ * We stop queueing more waiters and let user space ++ * deal with the mess. 
++ */ ++ break; ++ } ++ } ++ ++ /* ++ * We took an extra initial reference to the pi_state in ++ * futex_proxy_trylock_atomic(). We need to drop it here again. ++ */ ++ put_pi_state(pi_state); ++ ++out_unlock: ++ double_unlock_hb(hb1, hb2); ++ wake_up_q(&wake_q); ++ futex_hb_waiters_dec(hb2); ++ return ret ? ret : task_count; ++} ++ ++/** ++ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex ++ * @hb: the hash_bucket futex_q was original enqueued on ++ * @q: the futex_q woken while waiting to be requeued ++ * @timeout: the timeout associated with the wait (NULL if none) ++ * ++ * Determine the cause for the early wakeup. ++ * ++ * Return: ++ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR ++ */ ++static inline ++int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, ++ struct futex_q *q, ++ struct hrtimer_sleeper *timeout) ++{ ++ int ret; ++ ++ /* ++ * With the hb lock held, we avoid races while we process the wakeup. ++ * We only need to hold hb (and not hb2) to ensure atomicity as the ++ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. ++ * It can't be requeued from uaddr2 to something else since we don't ++ * support a PI aware source futex for requeue. ++ */ ++ WARN_ON_ONCE(&hb->lock != q->lock_ptr); ++ ++ /* ++ * We were woken prior to requeue by a timeout or a signal. ++ * Unqueue the futex_q and determine which it was. ++ */ ++ plist_del(&q->list, &hb->chain); ++ futex_hb_waiters_dec(hb); ++ ++ /* Handle spurious wakeups gracefully */ ++ ret = -EWOULDBLOCK; ++ if (timeout && !timeout->task) ++ ret = -ETIMEDOUT; ++ else if (signal_pending(current)) ++ ret = -ERESTARTNOINTR; ++ return ret; ++} ++ ++/** ++ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 ++ * @uaddr: the futex we initially wait on (non-pi) ++ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be ++ * the same type, no requeueing from private to shared, etc. ++ * @val: the expected value of uaddr ++ * @abs_time: absolute timeout ++ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all ++ * @uaddr2: the pi futex we will take prior to returning to user-space ++ * ++ * The caller will wait on uaddr and will be requeued by futex_requeue() to ++ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake ++ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to ++ * userspace. This ensures the rt_mutex maintains an owner when it has waiters; ++ * without one, the pi logic would not know which task to boost/deboost, if ++ * there was a need to. ++ * ++ * We call schedule in futex_wait_queue() when we enqueue and return there ++ * via the following-- ++ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() ++ * 2) wakeup on uaddr2 after a requeue ++ * 3) signal ++ * 4) timeout ++ * ++ * If 3, cleanup and return -ERESTARTNOINTR. ++ * ++ * If 2, we may then block on trying to take the rt_mutex and return via: ++ * 5) successful lock ++ * 6) signal ++ * 7) timeout ++ * 8) other lock acquisition failure ++ * ++ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). ++ * ++ * If 4 or 7, we cleanup and return with -ETIMEDOUT. 
++ * ++ * Return: ++ * - 0 - On success; ++ * - <0 - On error ++ */ ++int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, ++ u32 val, ktime_t *abs_time, u32 bitset, ++ u32 __user *uaddr2) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct rt_mutex_waiter rt_waiter; ++ struct futex_hash_bucket *hb; ++ union futex_key key2 = FUTEX_KEY_INIT; ++ struct futex_q q = futex_q_init; ++ struct rt_mutex_base *pi_mutex; ++ int res, ret; ++ ++ if (!IS_ENABLED(CONFIG_FUTEX_PI)) ++ return -ENOSYS; ++ ++ if (uaddr == uaddr2) ++ return -EINVAL; ++ ++ if (!bitset) ++ return -EINVAL; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ ++ /* ++ * The waiter is allocated on our stack, manipulated by the requeue ++ * code while we sleep on uaddr. ++ */ ++ rt_mutex_init_waiter(&rt_waiter); ++ ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ goto out; ++ ++ q.bitset = bitset; ++ q.rt_waiter = &rt_waiter; ++ q.requeue_pi_key = &key2; ++ ++ /* ++ * Prepare to wait on uaddr. On success, it holds hb->lock and q ++ * is initialized. ++ */ ++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ++ if (ret) ++ goto out; ++ ++ /* ++ * The check above which compares uaddrs is not sufficient for ++ * shared futexes. We need to compare the keys: ++ */ ++ if (futex_match(&q.key, &key2)) { ++ futex_q_unlock(hb); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ /* Queue the futex_q, drop the hb lock, wait for wakeup. */ ++ futex_wait_queue(hb, &q, to); ++ ++ switch (futex_requeue_pi_wakeup_sync(&q)) { ++ case Q_REQUEUE_PI_IGNORE: ++ /* The waiter is still on uaddr1 */ ++ spin_lock(&hb->lock); ++ ret = handle_early_requeue_pi_wakeup(hb, &q, to); ++ spin_unlock(&hb->lock); ++ break; ++ ++ case Q_REQUEUE_PI_LOCKED: ++ /* The requeue acquired the lock */ ++ if (q.pi_state && (q.pi_state->owner != current)) { ++ spin_lock(q.lock_ptr); ++ ret = fixup_pi_owner(uaddr2, &q, true); ++ /* ++ * Drop the reference to the pi state which the ++ * requeue_pi() code acquired for us. ++ */ ++ put_pi_state(q.pi_state); ++ spin_unlock(q.lock_ptr); ++ /* ++ * Adjust the return value. It's either -EFAULT or ++ * success (1) but the caller expects 0 for success. ++ */ ++ ret = ret < 0 ? ret : 0; ++ } ++ break; ++ ++ case Q_REQUEUE_PI_DONE: ++ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ ++ pi_mutex = &q.pi_state->pi_mutex; ++ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); ++ ++ /* Current is not longer pi_blocked_on */ ++ spin_lock(q.lock_ptr); ++ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ++ ret = 0; ++ ++ debug_rt_mutex_free_waiter(&rt_waiter); ++ /* ++ * Fixup the pi_state owner and possibly acquire the lock if we ++ * haven't already. ++ */ ++ res = fixup_pi_owner(uaddr2, &q, !ret); ++ /* ++ * If fixup_pi_owner() returned an error, propagate that. If it ++ * acquired the lock, clear -ETIMEDOUT or -EINTR. ++ */ ++ if (res) ++ ret = (res < 0) ? res : 0; ++ ++ futex_unqueue_pi(&q); ++ spin_unlock(q.lock_ptr); ++ ++ if (ret == -EINTR) { ++ /* ++ * We've already been requeued, but cannot restart ++ * by calling futex_lock_pi() directly. We could ++ * restart this syscall, but it would detect that ++ * the user space "val" changed and return ++ * -EWOULDBLOCK. Save the overhead of the restart ++ * and return -EWOULDBLOCK directly. 
++ */ ++ ret = -EWOULDBLOCK; ++ } ++ break; ++ default: ++ BUG(); ++ } ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret; ++} ++ +diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c +new file mode 100644 +index 000000000..368e9c17f +--- /dev/null ++++ b/kernel/futex/syscalls.c +@@ -0,0 +1,396 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++#include ++#include ++#include ++ ++#include "futex.h" ++ ++/* ++ * Support for robust futexes: the kernel cleans up held futexes at ++ * thread exit time. ++ * ++ * Implementation: user-space maintains a per-thread list of locks it ++ * is holding. Upon do_exit(), the kernel carefully walks this list, ++ * and marks all locks that are owned by this thread with the ++ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is ++ * always manipulated with the lock held, so the list is private and ++ * per-thread. Userspace also maintains a per-thread 'list_op_pending' ++ * field, to allow the kernel to clean up if the thread dies after ++ * acquiring the lock, but just before it could have added itself to ++ * the list. There can only be one such pending lock. ++ */ ++ ++/** ++ * sys_set_robust_list() - Set the robust-futex list head of a task ++ * @head: pointer to the list-head ++ * @len: length of the list-head, as userspace expects ++ */ ++SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, ++ size_t, len) ++{ ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ /* ++ * The kernel knows only one size for now: ++ */ ++ if (unlikely(len != sizeof(*head))) ++ return -EINVAL; ++ ++ current->robust_list = head; ++ ++ return 0; ++} ++ ++/** ++ * sys_get_robust_list() - Get the robust-futex list head of a task ++ * @pid: pid of the process [zero for current task] ++ * @head_ptr: pointer to a list-head pointer, the kernel fills it in ++ * @len_ptr: pointer to a length field, the kernel fills in the header size ++ */ ++SYSCALL_DEFINE3(get_robust_list, int, pid, ++ struct robust_list_head __user * __user *, head_ptr, ++ size_t __user *, len_ptr) ++{ ++ struct robust_list_head __user *head; ++ unsigned long ret; ++ struct task_struct *p; ++ ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ ++ rcu_read_lock(); ++ ++ ret = -ESRCH; ++ if (!pid) ++ p = current; ++ else { ++ p = find_task_by_vpid(pid); ++ if (!p) ++ goto err_unlock; ++ } ++ ++ ret = -EPERM; ++ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) ++ goto err_unlock; ++ ++ head = p->robust_list; ++ rcu_read_unlock(); ++ ++ if (put_user(sizeof(*head), len_ptr)) ++ return -EFAULT; ++ return put_user(head, head_ptr); ++ ++err_unlock: ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ++ u32 __user *uaddr2, u32 val2, u32 val3) ++{ ++ int cmd = op & FUTEX_CMD_MASK; ++ unsigned int flags = 0; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) { ++ flags |= FLAGS_CLOCKRT; ++ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && ++ cmd != FUTEX_LOCK_PI2) ++ return -ENOSYS; ++ } ++ ++ switch (cmd) { ++ case FUTEX_LOCK_PI: ++ case FUTEX_LOCK_PI2: ++ case FUTEX_UNLOCK_PI: ++ case FUTEX_TRYLOCK_PI: ++ case FUTEX_WAIT_REQUEUE_PI: ++ case FUTEX_CMP_REQUEUE_PI: ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ } ++ ++ switch (cmd) { ++ case FUTEX_WAIT: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ fallthrough; ++ case FUTEX_WAIT_BITSET: ++ return futex_wait(uaddr, flags, val, timeout, 
val3); ++ case FUTEX_WAKE: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ fallthrough; ++ case FUTEX_WAKE_BITSET: ++ return futex_wake(uaddr, flags, val, val3); ++ case FUTEX_REQUEUE: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); ++ case FUTEX_CMP_REQUEUE: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); ++ case FUTEX_WAKE_OP: ++ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); ++ case FUTEX_LOCK_PI: ++ flags |= FLAGS_CLOCKRT; ++ fallthrough; ++ case FUTEX_LOCK_PI2: ++ return futex_lock_pi(uaddr, flags, timeout, 0); ++ case FUTEX_UNLOCK_PI: ++ return futex_unlock_pi(uaddr, flags); ++ case FUTEX_TRYLOCK_PI: ++ return futex_lock_pi(uaddr, flags, NULL, 1); ++ case FUTEX_WAIT_REQUEUE_PI: ++ val3 = FUTEX_BITSET_MATCH_ANY; ++ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, ++ uaddr2); ++ case FUTEX_CMP_REQUEUE_PI: ++ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ } ++ return -ENOSYS; ++} ++ ++static __always_inline bool futex_cmd_has_timeout(u32 cmd) ++{ ++ switch (cmd) { ++ case FUTEX_WAIT: ++ case FUTEX_LOCK_PI: ++ case FUTEX_LOCK_PI2: ++ case FUTEX_WAIT_BITSET: ++ case FUTEX_WAIT_REQUEUE_PI: ++ return true; ++ } ++ return false; ++} ++ ++static __always_inline int ++futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) ++{ ++ if (!timespec64_valid(ts)) ++ return -EINVAL; ++ ++ *t = timespec64_to_ktime(*ts); ++ if (cmd == FUTEX_WAIT) ++ *t = ktime_add_safe(ktime_get(), *t); ++ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) ++ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); ++ return 0; ++} ++ ++SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, ++ const struct __kernel_timespec __user *, utime, ++ u32 __user *, uaddr2, u32, val3) ++{ ++ int ret, cmd = op & FUTEX_CMD_MASK; ++ ktime_t t, *tp = NULL; ++ struct timespec64 ts; ++ ++ if (utime && futex_cmd_has_timeout(cmd)) { ++ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) ++ return -EFAULT; ++ if (get_timespec64(&ts, utime)) ++ return -EFAULT; ++ ret = futex_init_timeout(cmd, op, &ts, &t); ++ if (ret) ++ return ret; ++ tp = &t; ++ } ++ ++ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); ++} ++ ++/* Mask of available flags for each futex in futex_waitv list */ ++#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) ++ ++/** ++ * futex_parse_waitv - Parse a waitv array from userspace ++ * @futexv: Kernel side list of waiters to be filled ++ * @uwaitv: Userspace list to be parsed ++ * @nr_futexes: Length of futexv ++ * ++ * Return: Error code on failure, 0 on success ++ */ ++static int futex_parse_waitv(struct futex_vector *futexv, ++ struct futex_waitv __user *uwaitv, ++ unsigned int nr_futexes) ++{ ++ struct futex_waitv aux; ++ unsigned int i; ++ ++ for (i = 0; i < nr_futexes; i++) { ++ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) ++ return -EFAULT; ++ ++ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) ++ return -EINVAL; ++ ++ if (!(aux.flags & FUTEX_32)) ++ return -EINVAL; ++ ++ futexv[i].w.flags = aux.flags; ++ futexv[i].w.val = aux.val; ++ futexv[i].w.uaddr = aux.uaddr; ++ futexv[i].q = futex_q_init; ++ } ++ ++ return 0; ++} ++ ++/** ++ * sys_futex_waitv - Wait on a list of futexes ++ * @waiters: List of futexes to wait on ++ * @nr_futexes: Length of futexv ++ * @flags: Flag for timeout (monotonic/realtime) ++ * @timeout: Optional absolute timeout. ++ * @clockid: Clock to be used for the timeout, realtime or monotonic. 
++ * ++ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes ++ * if a futex_wake() is performed at any uaddr. The syscall returns immediately ++ * if any waiter has *uaddr != val. *timeout is an optional timeout value for the ++ * operation. Each waiter has individual flags. The `flags` argument for the ++ * syscall should be used solely for specifying the timeout as realtime, if ++ * needed. Flags for private futexes, sizes, etc. should be used on the ++ * individual flags of each waiter. ++ * ++ * Returns the array index of one of the awaken futexes. There's no given ++ * information of how many were awakened, or any particular attribute of it (if ++ * it's the first awakened, if it is of the smaller index...). ++ */ ++ ++SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, ++ unsigned int, nr_futexes, unsigned int, flags, ++ struct __kernel_timespec __user *, timeout, clockid_t, clockid) ++{ ++ struct hrtimer_sleeper to; ++ struct futex_vector *futexv; ++ struct timespec64 ts; ++ ktime_t time; ++ int ret; ++ ++ /* This syscall supports no flags for now */ ++ if (flags) ++ return -EINVAL; ++ ++ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) ++ return -EINVAL; ++ ++ if (timeout) { ++ int flag_clkid = 0, flag_init = 0; ++ ++ if (clockid == CLOCK_REALTIME) { ++ flag_clkid = FLAGS_CLOCKRT; ++ flag_init = FUTEX_CLOCK_REALTIME; ++ } ++ ++ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) ++ return -EINVAL; ++ ++ if (get_timespec64(&ts, timeout)) ++ return -EFAULT; ++ ++ /* ++ * Since there's no opcode for futex_waitv, use ++ * FUTEX_WAIT_BITSET that uses absolute timeout as well ++ */ ++ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); ++ if (ret) ++ return ret; ++ ++ futex_setup_timer(&time, &to, flag_clkid, 0); ++ } ++ ++ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); ++ if (!futexv) ++ return -ENOMEM; ++ ++ ret = futex_parse_waitv(futexv, waiters, nr_futexes); ++ if (!ret) ++ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? 
&to : NULL); ++ ++ if (timeout) { ++ hrtimer_cancel(&to.timer); ++ destroy_hrtimer_on_stack(&to.timer); ++ } ++ ++ kfree(futexv); ++ return ret; ++} ++ ++#ifdef CONFIG_COMPAT ++COMPAT_SYSCALL_DEFINE2(set_robust_list, ++ struct compat_robust_list_head __user *, head, ++ compat_size_t, len) ++{ ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ ++ if (unlikely(len != sizeof(*head))) ++ return -EINVAL; ++ ++ current->compat_robust_list = head; ++ ++ return 0; ++} ++ ++COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, ++ compat_uptr_t __user *, head_ptr, ++ compat_size_t __user *, len_ptr) ++{ ++ struct compat_robust_list_head __user *head; ++ unsigned long ret; ++ struct task_struct *p; ++ ++ if (!futex_cmpxchg_enabled) ++ return -ENOSYS; ++ ++ rcu_read_lock(); ++ ++ ret = -ESRCH; ++ if (!pid) ++ p = current; ++ else { ++ p = find_task_by_vpid(pid); ++ if (!p) ++ goto err_unlock; ++ } ++ ++ ret = -EPERM; ++ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) ++ goto err_unlock; ++ ++ head = p->compat_robust_list; ++ rcu_read_unlock(); ++ ++ if (put_user(sizeof(*head), len_ptr)) ++ return -EFAULT; ++ return put_user(ptr_to_compat(head), head_ptr); ++ ++err_unlock: ++ rcu_read_unlock(); ++ ++ return ret; ++} ++#endif /* CONFIG_COMPAT */ ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, ++ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, ++ u32, val3) ++{ ++ int ret, cmd = op & FUTEX_CMD_MASK; ++ ktime_t t, *tp = NULL; ++ struct timespec64 ts; ++ ++ if (utime && futex_cmd_has_timeout(cmd)) { ++ if (get_old_timespec32(&ts, utime)) ++ return -EFAULT; ++ ret = futex_init_timeout(cmd, op, &ts, &t); ++ if (ret) ++ return ret; ++ tp = &t; ++ } ++ ++ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); ++} ++#endif /* CONFIG_COMPAT_32BIT_TIME */ ++ +diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c +new file mode 100644 +index 000000000..b45597aab +--- /dev/null ++++ b/kernel/futex/waitwake.c +@@ -0,0 +1,708 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++ ++#include ++#include ++#include ++ ++#include "futex.h" ++ ++/* ++ * READ this before attempting to hack on futexes! ++ * ++ * Basic futex operation and ordering guarantees ++ * ============================================= ++ * ++ * The waiter reads the futex value in user space and calls ++ * futex_wait(). This function computes the hash bucket and acquires ++ * the hash bucket lock. After that it reads the futex user space value ++ * again and verifies that the data has not changed. If it has not changed ++ * it enqueues itself into the hash bucket, releases the hash bucket lock ++ * and schedules. ++ * ++ * The waker side modifies the user space value of the futex and calls ++ * futex_wake(). This function computes the hash bucket and acquires the ++ * hash bucket lock. Then it looks for waiters on that futex in the hash ++ * bucket and wakes them. ++ * ++ * In futex wake up scenarios where no tasks are blocked on a futex, taking ++ * the hb spinlock can be avoided and simply return. 
In order for this ++ * optimization to work, ordering guarantees must exist so that the waiter ++ * being added to the list is acknowledged when the list is concurrently being ++ * checked by the waker, avoiding scenarios like the following: ++ * ++ * CPU 0 CPU 1 ++ * val = *futex; ++ * sys_futex(WAIT, futex, val); ++ * futex_wait(futex, val); ++ * uval = *futex; ++ * *futex = newval; ++ * sys_futex(WAKE, futex); ++ * futex_wake(futex); ++ * if (queue_empty()) ++ * return; ++ * if (uval == val) ++ * lock(hash_bucket(futex)); ++ * queue(); ++ * unlock(hash_bucket(futex)); ++ * schedule(); ++ * ++ * This would cause the waiter on CPU 0 to wait forever because it ++ * missed the transition of the user space value from val to newval ++ * and the waker did not find the waiter in the hash bucket queue. ++ * ++ * The correct serialization ensures that a waiter either observes ++ * the changed user space value before blocking or is woken by a ++ * concurrent waker: ++ * ++ * CPU 0 CPU 1 ++ * val = *futex; ++ * sys_futex(WAIT, futex, val); ++ * futex_wait(futex, val); ++ * ++ * waiters++; (a) ++ * smp_mb(); (A) <-- paired with -. ++ * | ++ * lock(hash_bucket(futex)); | ++ * | ++ * uval = *futex; | ++ * | *futex = newval; ++ * | sys_futex(WAKE, futex); ++ * | futex_wake(futex); ++ * | ++ * `--------> smp_mb(); (B) ++ * if (uval == val) ++ * queue(); ++ * unlock(hash_bucket(futex)); ++ * schedule(); if (waiters) ++ * lock(hash_bucket(futex)); ++ * else wake_waiters(futex); ++ * waiters--; (b) unlock(hash_bucket(futex)); ++ * ++ * Where (A) orders the waiters increment and the futex value read through ++ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write ++ * to futex and the waiters read (see futex_hb_waiters_pending()). ++ * ++ * This yields the following case (where X:=waiters, Y:=futex): ++ * ++ * X = Y = 0 ++ * ++ * w[X]=1 w[Y]=1 ++ * MB MB ++ * r[Y]=y r[X]=x ++ * ++ * Which guarantees that x==0 && y==0 is impossible; which translates back into ++ * the guarantee that we cannot both miss the futex variable change and the ++ * enqueue. ++ * ++ * Note that a new waiter is accounted for in (a) even when it is possible that ++ * the wait call can return error, in which case we backtrack from it in (b). ++ * Refer to the comment in futex_q_lock(). ++ * ++ * Similarly, in order to account for waiters being requeued on another ++ * address we always increment the waiters for the destination bucket before ++ * acquiring the lock. It then decrements them again after releasing it - ++ * the code that actually moves the futex(es) between hash buckets (requeue_futex) ++ * will do the additional required waiter count housekeeping. This is done for ++ * double_lock_hb() and double_unlock_hb(), respectively. ++ */ ++ ++/* ++ * The hash bucket lock must be held when this is called. ++ * Afterwards, the futex_q must not be accessed. Callers ++ * must ensure to later call wake_up_q() for the actual ++ * wakeups to occur. ++ */ ++void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) ++{ ++ struct task_struct *p = q->task; ++ ++ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) ++ return; ++ ++ get_task_struct(p); ++ __futex_unqueue(q); ++ /* ++ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL ++ * is written, without taking any locks. This is possible in the event ++ * of a spurious wakeup, for example. 
A memory barrier is required here ++ * to prevent the following store to lock_ptr from getting ahead of the ++ * plist_del in __futex_unqueue(). ++ */ ++ smp_store_release(&q->lock_ptr, NULL); ++ ++ /* ++ * Queue the task for later wakeup for after we've released ++ * the hb->lock. ++ */ ++ wake_q_add_safe(wake_q, p); ++} ++ ++/* ++ * Wake up waiters matching bitset queued on this futex (uaddr). ++ */ ++int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) ++{ ++ struct futex_hash_bucket *hb; ++ struct futex_q *this, *next; ++ union futex_key key = FUTEX_KEY_INIT; ++ int ret; ++ DEFINE_WAKE_Q(wake_q); ++ ++ if (!bitset) ++ return -EINVAL; ++ ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ hb = futex_hash(&key); ++ ++ /* Make sure we really have tasks to wakeup */ ++ if (!futex_hb_waiters_pending(hb)) ++ return ret; ++ ++ spin_lock(&hb->lock); ++ ++ plist_for_each_entry_safe(this, next, &hb->chain, list) { ++ if (futex_match (&this->key, &key)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ /* Check if one of the bits is set in both bitsets */ ++ if (!(this->bitset & bitset)) ++ continue; ++ ++ futex_wake_mark(&wake_q, this); ++ if (++ret >= nr_wake) ++ break; ++ } ++ } ++ ++ spin_unlock(&hb->lock); ++ wake_up_q(&wake_q); ++ return ret; ++} ++ ++static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) ++{ ++ unsigned int op = (encoded_op & 0x70000000) >> 28; ++ unsigned int cmp = (encoded_op & 0x0f000000) >> 24; ++ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); ++ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); ++ int oldval, ret; ++ ++ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { ++ if (oparg < 0 || oparg > 31) { ++ char comm[sizeof(current->comm)]; ++ /* ++ * kill this print and return -EINVAL when userspace ++ * is sane again ++ */ ++ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", ++ get_task_comm(comm, current), oparg); ++ oparg &= 31; ++ } ++ oparg = 1 << oparg; ++ } ++ ++ pagefault_disable(); ++ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); ++ pagefault_enable(); ++ if (ret) ++ return ret; ++ ++ switch (cmp) { ++ case FUTEX_OP_CMP_EQ: ++ return oldval == cmparg; ++ case FUTEX_OP_CMP_NE: ++ return oldval != cmparg; ++ case FUTEX_OP_CMP_LT: ++ return oldval < cmparg; ++ case FUTEX_OP_CMP_GE: ++ return oldval >= cmparg; ++ case FUTEX_OP_CMP_LE: ++ return oldval <= cmparg; ++ case FUTEX_OP_CMP_GT: ++ return oldval > cmparg; ++ default: ++ return -ENOSYS; ++ } ++} ++ ++/* ++ * Wake up all waiters hashed on the physical page that is mapped ++ * to this virtual address: ++ */ ++int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, ++ int nr_wake, int nr_wake2, int op) ++{ ++ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; ++ struct futex_hash_bucket *hb1, *hb2; ++ struct futex_q *this, *next; ++ int ret, op_ret; ++ DEFINE_WAKE_Q(wake_q); ++ ++retry: ++ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++ hb1 = futex_hash(&key1); ++ hb2 = futex_hash(&key2); ++ ++retry_private: ++ double_lock_hb(hb1, hb2); ++ op_ret = futex_atomic_op_inuser(op, uaddr2); ++ if (unlikely(op_ret < 0)) { ++ double_unlock_hb(hb1, hb2); ++ ++ if 
(!IS_ENABLED(CONFIG_MMU) || ++ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { ++ /* ++ * we don't get EFAULT from MMU faults if we don't have ++ * an MMU, but we might get them from range checking ++ */ ++ ret = op_ret; ++ return ret; ++ } ++ ++ if (op_ret == -EFAULT) { ++ ret = fault_in_user_writeable(uaddr2); ++ if (ret) ++ return ret; ++ } ++ ++ cond_resched(); ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ goto retry; ++ } ++ ++ plist_for_each_entry_safe(this, next, &hb1->chain, list) { ++ if (futex_match (&this->key, &key1)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ futex_wake_mark(&wake_q, this); ++ if (++ret >= nr_wake) ++ break; ++ } ++ } ++ ++ if (op_ret > 0) { ++ op_ret = 0; ++ plist_for_each_entry_safe(this, next, &hb2->chain, list) { ++ if (futex_match (&this->key, &key2)) { ++ if (this->pi_state || this->rt_waiter) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ futex_wake_mark(&wake_q, this); ++ if (++op_ret >= nr_wake2) ++ break; ++ } ++ } ++ ret += op_ret; ++ } ++ ++out_unlock: ++ double_unlock_hb(hb1, hb2); ++ wake_up_q(&wake_q); ++ return ret; ++} ++ ++static long futex_wait_restart(struct restart_block *restart); ++ ++/** ++ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal ++ * @hb: the futex hash bucket, must be locked by the caller ++ * @q: the futex_q to queue up on ++ * @timeout: the prepared hrtimer_sleeper, or null for no timeout ++ */ ++void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, ++ struct hrtimer_sleeper *timeout) ++{ ++ /* ++ * The task state is guaranteed to be set before another task can ++ * wake it. set_current_state() is implemented using smp_store_mb() and ++ * futex_queue() calls spin_unlock() upon completion, both serializing ++ * access to the hash list and forcing another memory barrier. ++ */ ++ set_current_state(TASK_INTERRUPTIBLE); ++ futex_queue(q, hb); ++ ++ /* Arm the timer */ ++ if (timeout) ++ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); ++ ++ /* ++ * If we have been removed from the hash list, then another task ++ * has tried to wake us, and we can skip the call to schedule(). ++ */ ++ if (likely(!plist_node_empty(&q->list))) { ++ /* ++ * If the timer has already expired, current will already be ++ * flagged for rescheduling. Only call schedule if there ++ * is no timeout, or if it has yet to expire. ++ */ ++ if (!timeout || timeout->task) ++ freezable_schedule(); ++ } ++ __set_current_state(TASK_RUNNING); ++} ++ ++/** ++ * unqueue_multiple - Remove various futexes from their hash bucket ++ * @v: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - No futex was awoken ++ */ ++static int unqueue_multiple(struct futex_vector *v, int count) ++{ ++ int ret = -1, i; ++ ++ for (i = 0; i < count; i++) { ++ if (!futex_unqueue(&v[i].q)) ++ ret = i; ++ } ++ ++ return ret; ++} ++ ++/** ++ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes ++ * @vs: The futex list to wait on ++ * @count: The size of the list ++ * @awaken: Index of the last awoken futex, if any. Used to notify the ++ * caller that it can return this index to userspace (return parameter) ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. 
On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ bool retry = false; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to enqueue ++ * each futex in the list before dealing with the next one to avoid ++ * deadlocking on the hash bucket. But, before enqueuing, we need to ++ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't ++ * absorb any awake events, which cannot be done before the ++ * get_futex_key of the next key, because it calls get_user_pages, ++ * which can sleep. Thus, we fetch the list of futexes keys in two ++ * steps, by first pinning all the memory keys in the futex key, and ++ * only then we read each key and queue the corresponding futex. ++ * ++ * Private futexes doesn't need to recalculate hash in retry, so skip ++ * get_futex_key() when retrying. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) ++ continue; ++ ++ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), ++ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), ++ &vs[i].q.key, FUTEX_READ); ++ ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; ++ struct futex_q *q = &vs[i].q; ++ u32 val = (u32)vs[i].w.val; ++ ++ hb = futex_q_lock(q); ++ ret = futex_get_value_locked(&uval, uaddr); ++ ++ if (!ret && uval == val) { ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ futex_queue(q, hb); ++ continue; ++ } ++ ++ futex_q_unlock(hb); ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * Even if something went wrong, if we find out that a futex ++ * was awaken, we don't return error and return this index to ++ * userspace ++ */ ++ *awaken = unqueue_multiple(vs, i); ++ if (*awaken >= 0) ++ return 1; ++ ++ if (ret) { ++ /* ++ * If we need to handle a page fault, we need to do so ++ * without any lock and any enqueued futex (otherwise ++ * we could lose some wakeup). So we do it here, after ++ * undoing all the work done so far. In success, we ++ * retry all the work. ++ */ ++ if (get_user(uval, uaddr)) ++ return -EFAULT; ++ ++ retry = true; ++ goto retry; ++ } ++ ++ if (uval != val) ++ return -EWOULDBLOCK; ++ } ++ ++ return 0; ++} ++ ++/** ++ * futex_sleep_multiple - Check sleeping conditions and sleep ++ * @vs: List of futexes to wait for ++ * @count: Length of vs ++ * @to: Timeout ++ * ++ * Sleep if and only if the timeout hasn't expired and no futex on the list has ++ * been awaken. 
++ */ ++static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ if (to && !to->task) ++ return; ++ ++ for (; count; count--, vs++) { ++ if (!READ_ONCE(vs->q.lock_ptr)) ++ return; ++ } ++ ++ freezable_schedule(); ++} ++ ++/** ++ * futex_wait_multiple - Prepare to wait on and enqueue several futexes ++ * @vs: The list of futexes to wait on ++ * @count: The number of objects ++ * @to: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that is ++ * wake, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++int futex_wait_multiple(struct futex_vector *vs, unsigned int count, ++ struct hrtimer_sleeper *to) ++{ ++ int ret, hint = 0; ++ ++ if (to) ++ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); ++ ++ while (1) { ++ ret = futex_wait_multiple_setup(vs, count, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ return ret; ++ } ++ ++ futex_sleep_multiple(vs, count, to); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = unqueue_multiple(vs, count); ++ if (ret >= 0) ++ return ret; ++ ++ if (to && !to->task) ++ return -ETIMEDOUT; ++ else if (signal_pending(current)) ++ return -ERESTARTSYS; ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++} ++ ++/** ++ * futex_wait_setup() - Prepare to wait on a futex ++ * @uaddr: the futex userspace address ++ * @val: the expected value ++ * @flags: futex flags (FLAGS_SHARED, etc.) ++ * @q: the associated futex_q ++ * @hb: storage for hash_bucket pointer to be returned to caller ++ * ++ * Setup the futex_q and locate the hash_bucket. Get the futex value and ++ * compare it with the expected value. Handle atomic faults internally. ++ * Return with the hb lock held on success, and unlocked on failure. ++ * ++ * Return: ++ * - 0 - uaddr contains val and hb has been locked; ++ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked ++ */ ++int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ u32 uval; ++ int ret; ++ ++ /* ++ * Access the page AFTER the hash-bucket is locked. ++ * Order is important: ++ * ++ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); ++ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } ++ * ++ * The basic logical guarantee of a futex is that it blocks ONLY ++ * if cond(var) is known to be true at the time of blocking, for ++ * any cond. If we locked the hash-bucket after testing *uaddr, that ++ * would open a race condition where we could block indefinitely with ++ * cond(var) false, which would violate the guarantee. ++ * ++ * On the other hand, we insert q and release the hash-bucket only ++ * after testing *uaddr. This guarantees that futex_wait() will NOT ++ * absorb a wakeup if *uaddr does not match the desired values ++ * while the syscall executes. 
++ */ ++retry: ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; ++ ++retry_private: ++ *hb = futex_q_lock(q); ++ ++ ret = futex_get_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ futex_q_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ goto retry; ++ } ++ ++ if (uval != val) { ++ futex_q_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ ++int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ struct restart_block *restart; ++ struct futex_hash_bucket *hb; ++ struct futex_q q = futex_q_init; ++ int ret; ++ ++ if (!bitset) ++ return -EINVAL; ++ q.bitset = bitset; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++retry: ++ /* ++ * Prepare to wait on uaddr. On success, it holds hb->lock and q ++ * is initialized. ++ */ ++ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); ++ if (ret) ++ goto out; ++ ++ /* futex_queue and wait for wakeup, timeout, or a signal. */ ++ futex_wait_queue(hb, &q, to); ++ ++ /* If we were woken (and unqueued), we succeeded, whatever. */ ++ ret = 0; ++ if (!futex_unqueue(&q)) ++ goto out; ++ ret = -ETIMEDOUT; ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. ++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ restart = ¤t->restart_block; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = val; ++ restart->futex.time = *abs_time; ++ restart->futex.bitset = bitset; ++ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ++ ++ ret = set_restart_fn(restart, futex_wait_restart); ++ ++out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ return ret; ++} ++ ++static long futex_wait_restart(struct restart_block *restart) ++{ ++ u32 __user *uaddr = restart->futex.uaddr; ++ ktime_t t, *tp = NULL; ++ ++ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { ++ t = restart->futex.time; ++ tp = &t; ++ } ++ restart->fn = do_no_restart_syscall; ++ ++ return (long)futex_wait(uaddr, restart->futex.flags, ++ restart->futex.val, tp, restart->futex.bitset); ++} ++ +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index f43d89d92..d1944258c 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -143,13 +143,14 @@ COND_SYSCALL(capset); + /* __ARCH_WANT_SYS_CLONE3 */ + COND_SYSCALL(clone3); + +-/* kernel/futex.c */ ++/* kernel/futex/syscalls.c */ + COND_SYSCALL(futex); + COND_SYSCALL(futex_time32); + COND_SYSCALL(set_robust_list); + COND_SYSCALL_COMPAT(set_robust_list); + COND_SYSCALL(get_robust_list); + COND_SYSCALL_COMPAT(get_robust_list); ++COND_SYSCALL(futex_waitv); + + /* kernel/hrtimer.c */ + +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index 0e78b49d0..fbcbdb696 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -8,3 +8,4 @@ futex_wait_uninitialized_heap + futex_wait_wouldblock + futex_wait + futex_requeue ++futex_waitv +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index bd1fec59e..5cc38de9d 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ 
b/tools/testing/selftests/futex/functional/Makefile +@@ -17,7 +17,8 @@ TEST_GEN_FILES := \ + futex_wait_uninitialized_heap \ + futex_wait_private_mapped_file \ + futex_wait \ +- futex_requeue ++ futex_requeue \ ++ futex_waitv + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +index 1f8f6daaf..3651ce17b 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +@@ -17,6 +17,7 @@ + + #include + #include "futextest.h" ++#include "futex2test.h" + #include "logging.h" + + #define TEST_NAME "futex-wait-timeout" +@@ -96,6 +97,12 @@ int main(int argc, char *argv[]) + struct timespec to; + pthread_t thread; + int c; ++ struct futex_waitv waitv = { ++ .uaddr = (uintptr_t)&f1, ++ .val = f1, ++ .flags = FUTEX_32, ++ .__reserved = 0 ++ }; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { +@@ -118,7 +125,7 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(7); ++ ksft_set_plan(9); + ksft_print_msg("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); +@@ -175,6 +182,18 @@ int main(int argc, char *argv[]) + res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); + test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); + ++ /* futex_waitv with CLOCK_MONOTONIC */ ++ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) ++ return RET_FAIL; ++ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); ++ test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); ++ ++ /* futex_waitv with CLOCK_REALTIME */ ++ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) ++ return RET_FAIL; ++ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); ++ test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT); ++ + ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +index 0ae390ff8..7d7a6a06c 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +@@ -22,6 +22,7 @@ + #include + #include + #include "futextest.h" ++#include "futex2test.h" + #include "logging.h" + + #define TEST_NAME "futex-wait-wouldblock" +@@ -42,6 +43,12 @@ int main(int argc, char *argv[]) + futex_t f1 = FUTEX_INITIALIZER; + int res, ret = RET_PASS; + int c; ++ struct futex_waitv waitv = { ++ .uaddr = (uintptr_t)&f1, ++ .val = f1+1, ++ .flags = FUTEX_32, ++ .__reserved = 0 ++ }; + + while ((c = getopt(argc, argv, "cht:v:")) != -1) { + switch (c) { +@@ -61,18 +68,44 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); + + info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); + res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); + if (!res || errno != EWOULDBLOCK) { +- fail("futex_wait returned: %d %s\n", +- res ? errno : res, res ? strerror(errno) : ""); ++ ksft_test_result_fail("futex_wait returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); + ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait\n"); + } + +- print_result(TEST_NAME, ret); ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) { ++ error("clock_gettime failed\n", errno); ++ return errno; ++ } ++ ++ to.tv_nsec += timeout_ns; ++ ++ if (to.tv_nsec >= 1000000000) { ++ to.tv_sec++; ++ to.tv_nsec -= 1000000000; ++ } ++ ++ info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); ++ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); ++ if (!res || errno != EWOULDBLOCK) { ++ ksft_test_result_pass("futex_waitv returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv\n"); ++ } ++ ++ ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c +new file mode 100644 +index 000000000..a94337f67 +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex_waitv.c +@@ -0,0 +1,237 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * futex_waitv() test by André Almeida ++ * ++ * Copyright 2021 Collabora Ltd. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futextest.h" ++#include "futex2test.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex-wait" ++#define WAKE_WAIT_US 10000 ++#define NR_FUTEXES 30 ++static struct futex_waitv waitv[NR_FUTEXES]; ++u_int32_t futexes[NR_FUTEXES] = {0}; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, VCRITICAL, VINFO); ++} ++ ++void *waiterfn(void *arg) ++{ ++ struct timespec to; ++ int res; ++ ++ /* setting absolute timeout for futex2 */ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); ++ if (res < 0) { ++ ksft_test_result_fail("futex_waitv returned: %d %s\n", ++ errno, strerror(errno)); ++ } else if (res != NR_FUTEXES - 1) { ++ ksft_test_result_fail("futex_waitv returned: %d, expecting %d\n", ++ res, NR_FUTEXES - 1); ++ } ++ ++ return NULL; ++} ++ ++int main(int argc, char *argv[]) ++{ ++ pthread_t waiter; ++ int res, ret = RET_PASS; ++ struct timespec to; ++ int c, i; ++ ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } ++ ++ ksft_print_header(); ++ ksft_set_plan(7); ++ ksft_print_msg("%s: Test FUTEX_WAITV\n", ++ basename(argv[0])); ++ ++ for (i = 0; i < NR_FUTEXES; i++) { ++ waitv[i].uaddr = (uintptr_t)&futexes[i]; ++ waitv[i].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG; ++ waitv[i].val = 0; ++ waitv[i].__reserved = 0; ++ } ++ ++ /* Private waitv */ ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); ++ ++ usleep(WAKE_WAIT_US); ++ ++ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, FUTEX_PRIVATE_FLAG); ++ if (res != 1) { ++ ksft_test_result_fail("futex_wake private returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv private\n"); ++ } ++ ++ /* Shared waitv */ ++ for (i = 0; i < NR_FUTEXES; i++) { ++ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); ++ ++ if (shm_id < 0) { ++ perror("shmget"); ++ exit(1); ++ } ++ ++ unsigned int *shared_data = shmat(shm_id, NULL, 0); ++ ++ *shared_data = 0; ++ waitv[i].uaddr = (uintptr_t)shared_data; ++ waitv[i].flags = FUTEX_32; ++ waitv[i].val = 0; ++ waitv[i].__reserved = 0; ++ } ++ ++ if (pthread_create(&waiter, NULL, waiterfn, NULL)) ++ error("pthread_create failed\n", errno); ++ ++ usleep(WAKE_WAIT_US); ++ ++ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, 0); ++ if (res != 1) { ++ ksft_test_result_fail("futex_wake shared returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv shared\n"); ++ } ++ ++ for (i = 0; i < NR_FUTEXES; i++) ++ shmdt(u64_to_ptr(waitv[i].uaddr)); ++ ++ /* Testing a waiter without FUTEX_32 flag */ ++ waitv[0].flags = FUTEX_PRIVATE_FLAG; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); ++ if (res == EINVAL) { ++ ksft_test_result_fail("futex_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv without FUTEX_32\n"); ++ } ++ ++ /* Testing a waiter with an unaligned address */ ++ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; ++ waitv[0].uaddr = 1; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); ++ if (res == EINVAL) { ++ ksft_test_result_fail("futex_wake private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv with an unaligned address\n"); ++ } ++ ++ /* Testing a NULL address for waiters.uaddr */ ++ waitv[0].uaddr = 0x00000000; ++ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); ++ if (res == EINVAL) { ++ ksft_test_result_fail("futex_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); ++ } ++ ++ /* Testing a NULL address for *waiters */ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); ++ if (res == EINVAL) { ++ ksft_test_result_fail("futex_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); ++ } ++ ++ /* Testing an invalid clockid */ ++ if (clock_gettime(CLOCK_MONOTONIC, &to)) ++ error("gettime64 failed\n", errno); ++ ++ to.tv_sec++; ++ ++ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_TAI); ++ if (res == EINVAL) { ++ ksft_test_result_fail("futex_waitv private returned: %d %s\n", ++ res ? errno : res, ++ res ? 
strerror(errno) : ""); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_waitv invalid clockid\n"); ++ } ++ ++ ksft_print_cnts(); ++ return ret; ++} +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 11a9d6229..5ccd599da 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -79,3 +79,6 @@ echo + + echo + ./futex_requeue $COLOR ++ ++echo ++./futex_waitv $COLOR +diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h +new file mode 100644 +index 000000000..9d305520e +--- /dev/null ++++ b/tools/testing/selftests/futex/include/futex2test.h +@@ -0,0 +1,22 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * Futex2 library addons for futex tests ++ * ++ * Copyright 2021 Collabora Ltd. ++ */ ++#include ++ ++#define u64_to_ptr(x) ((void *)(uintptr_t)(x)) ++ ++/** ++ * futex_waitv - Wait at multiple futexes, wake on any ++ * @waiters: Array of waiters ++ * @nr_waiters: Length of waiters array ++ * @flags: Operation flags ++ * @timo: Optional timeout for operation ++ */ ++static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters, ++ unsigned long flags, struct timespec *timo, clockid_t clockid) ++{ ++ return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid); ++} +-- +2.33.1.711.g9d530dc002 + diff --git a/sys-kernel/calculate-sources/Manifest b/sys-kernel/calculate-sources/Manifest index 1c2667830..f60b71fd8 100644 --- a/sys-kernel/calculate-sources/Manifest +++ b/sys-kernel/calculate-sources/Manifest @@ -3,7 +3,7 @@ DIST linux-5.14.tar.xz 120669872 BLAKE2B 0047f5aaa3940dff97f4055ef544faafbbb5282 DIST linux-5.15.tar.xz 121913744 BLAKE2B 3921274b23f7938abdf3ed9334534b4581e13d7484303d3a5280eddb038999aaa8b836666a487472d9c4a219af0f06b9fecccaf348fb5510ab8762f4ef4b7e83 SHA512 d25ad40b5bcd6a4c6042fd0fd84e196e7a58024734c3e9a484fd0d5d54a0c1d87db8a3c784eff55e43b6f021709dc685eb0efa18d2aec327e4f88a79f405705a DIST linux-5.4.tar.xz 109441440 BLAKE2B 193bc4a3147e147d5529956164ec4912fad5d5c6fb07f909ff1056e57235834173194afc686993ccd785c1ff15804de0961b625f3008cca0e27493efc8f27b13 SHA512 9f60f77e8ab972b9438ac648bed17551c8491d6585a5e85f694b2eaa4c623fbc61eb18419b2656b6795eac5deec0edaa04547fc6723fbda52256bd7f3486898f DIST patch-5.10.73.xz 2213532 BLAKE2B 8069486891bd8bd2d6f15204f6c18848c374a650254a4b381126407fdc3136b625f8e13e3a3ba93817f71c80297a5ad7c4f25a8f43330f0017f37396c033cf95 SHA512 630d564b49ea9e5d67ac2395b312e653ba8aa28011cd5565ad506b0296c7598eebbf2e7a792e06b4c90e75bb764e0c2dc0246b11d496e7ad5533c0e177d887cc -DIST patch-5.10.76.xz 2256124 BLAKE2B 2c9b8f25e2a39ad7208313c738db45873ff9bf839d59ca3c3f57d1e7f079581c6d57af79a21e2340d9fbbaa159ac168dd819fab3e840764b360c5ab87c892b6d SHA512 a64d038a370da6a18e5dfea1f9962edc5ea44a144cfc0bebde57672b5a054533fffa5077e37efc8e9171d6fd2e8475de280f1806de6826ba7eebf5799ed0728b -DIST patch-5.14.15.xz 573672 BLAKE2B 1c33320afd02af7cb159e8b298c56d4bc6e3785387193f3fbb03460240353b7d0d353d69c412fdb2aaf8b33c76f9094e5cee53f02311ea4c2e05d2c73dd653bb SHA512 02f6e4e44e05bcfd9cbb0d2df0df8b42df67980f02decd95fb23bbbd29d0185da85d203ee1de20059863e1dec93f91ca5e1ae18b37763f788f7091a64d62eb53 +DIST patch-5.10.77.xz 2272320 BLAKE2B 209beaa7f6cbc12e9b40370cad2fa30cb062be843aa669cff1908742ceed5eb65a1f1b0d5b2e55379907065b5d86c431f8db80ef01e58bf77e4552e4163e7938 SHA512 
51d222ab9622e37dcfb3ec4bba867b043880179e38fb991da48b86183a83f3f1af00485b461f757a7e58899e062738b9e8f7caced550dd2660fef33e5afe3262 +DIST patch-5.14.16.xz 602960 BLAKE2B 467f64556136fb2f961042f9a99a8e80f651b4f3476e88d908f04193a8fed7975e8685e4867cdd16e2e0912e0e22931727502958c61ba0940cd3bc4395a7ce8e SHA512 c909aea0ca239d32e06a8602b154397de3885588410738d099d6a13f0149766fd106e6e7e235b0c4a3873d8155e403a371d82a85af5592c619d1ce683800c811 DIST patch-5.4.124.xz 3006220 BLAKE2B 394645f0cf7898c98d416e93858b3effe171d5bdbc968bccd876c41de527ce0221331a06744cc081a05407d3b539d7ab83ae526ff1cc99e9e17629af7b968932 SHA512 9a8a5388d921c55a6f620f2da0528c4d0ed4487cfa58ac876b7b9625247860e3b25bbfcd39b4ae73f34c2d2b8a45b155a149613a650a1306bdab4bad57f8f9e9 -DIST patch-5.4.156.xz 3438776 BLAKE2B 60024ef0f6665a425136c32349477056fa49df93974ae11387ca4aab3a3f987e469fe3a38e04c15d016a67c44d364c23806111a86c6d5ee1547833dee3ee9a5a SHA512 96fb6c79d86efa2a578bdf49025e7ec46963fc8d5a4ba2d11a19e4b0c84adbc86033e13400a90fa467b460adbf1f2b4ab60f2b050dfb20c6eb3e497bf885c4aa +DIST patch-5.4.157.xz 3446952 BLAKE2B e443b31e2c505886009b8625638d678d974f6223291c3ef079189df341f3700cca208b350150e14d284269ec0143d228277d1a0edb35e59808e268ffe3fddfc0 SHA512 b274006c15c7a09ad459f59d90fe88128b15a4a4eaf984bce97ebc5b701fa83a28ebfb5fb44d7c5086f73ee99a212b137b2de06a5bfc1b1c889a63f8840afeb2 diff --git a/sys-kernel/calculate-sources/calculate-sources-5.10.76.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.10.77.ebuild similarity index 100% rename from sys-kernel/calculate-sources/calculate-sources-5.10.76.ebuild rename to sys-kernel/calculate-sources/calculate-sources-5.10.77.ebuild diff --git a/sys-kernel/calculate-sources/calculate-sources-5.14.15.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.14.16.ebuild similarity index 100% rename from sys-kernel/calculate-sources/calculate-sources-5.14.15.ebuild rename to sys-kernel/calculate-sources/calculate-sources-5.14.16.ebuild diff --git a/sys-kernel/calculate-sources/calculate-sources-5.15.0.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.15.0.ebuild index 1b9e9d758..199708287 100644 --- a/sys-kernel/calculate-sources/calculate-sources-5.15.0.ebuild +++ b/sys-kernel/calculate-sources/calculate-sources-5.15.0.ebuild @@ -13,7 +13,7 @@ HOMEPAGE="http://www.calculate-linux.org" SRC_URI="${KERNEL_URI} ${ARCH_URI}" -IUSE="uksm" +IUSE="fsync uksm" src_unpack() { calculate-kernel-8_src_unpack diff --git a/sys-kernel/calculate-sources/calculate-sources-5.4.156.ebuild b/sys-kernel/calculate-sources/calculate-sources-5.4.157.ebuild similarity index 100% rename from sys-kernel/calculate-sources/calculate-sources-5.4.156.ebuild rename to sys-kernel/calculate-sources/calculate-sources-5.4.157.ebuild
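
The futex_waitv() syscall wired up by the patch above can be exercised from user space roughly as follows. This is a minimal sketch and is not part of the patch: the syscall number (449) and the local struct futex_waitv / FUTEX_32 / FUTEX_PRIVATE_FLAG definitions are assumptions that mirror the uapi header added by the patch, spelled out locally in case the installed kernel headers predate it. With nobody waking the futex, the call is expected to fail with ETIMEDOUT after about one second.

/*
 * Minimal futex_waitv() sketch (assumptions noted above): waits on a single
 * private 32-bit futex with an absolute CLOCK_MONOTONIC timeout.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_futex_waitv
#define __NR_futex_waitv 449		/* assumed syscall number from the patched tables */
#endif

#define FUTEX_32		2	/* size flag from the new uapi header */
#define FUTEX_PRIVATE_FLAG	128

struct futex_waitv {			/* mirrors the layout added to uapi/linux/futex.h */
	uint64_t val;
	uint64_t uaddr;
	uint32_t flags;
	uint32_t __reserved;
};

int main(void)
{
	uint32_t futex_word = 0;
	struct futex_waitv waiter = {
		.val		= 0,	/* expected value of *uaddr, so we block */
		.uaddr		= (uintptr_t)&futex_word,
		.flags		= FUTEX_32 | FUTEX_PRIVATE_FLAG,
		.__reserved	= 0,
	};
	struct timespec to;
	long ret;

	/* futex_waitv() takes an absolute timeout; flags argument must be 0. */
	clock_gettime(CLOCK_MONOTONIC, &to);
	to.tv_sec += 1;

	ret = syscall(__NR_futex_waitv, &waiter, 1, 0, &to, CLOCK_MONOTONIC);
	if (ret >= 0)
		printf("woken on waiter index %ld\n", ret);
	else
		printf("futex_waitv: %s\n", strerror(errno));	/* ETIMEDOUT expected */
	return 0;
}

Because the template header carries the merge(sys-kernel/calculate-sources[fsync]) condition, this patch is only installed into the 5.15.0 sources when the new fsync USE flag is enabled for sys-kernel/calculate-sources, for example through a package.use entry, and the kernel is rebuilt from those sources.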