diff --git a/Makefile b/Makefile
index c4610edd8..cd79b4837 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 3
 PATCHLEVEL = 4
-SUBLEVEL = 111
+SUBLEVEL = 112
 EXTRAVERSION =
 NAME = Saber-toothed Squirrel
 
@@ -243,8 +243,8 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
 	  else if [ -x /bin/bash ]; then echo /bin/bash; \
 	  else echo sh; fi ; fi)
 
-HOSTCC       = gcc
-HOSTCXX      = g++
+HOSTCC       = ccache gcc
+HOSTCXX      = ccache g++
 HOSTCFLAGS   = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89
 HOSTCXXFLAGS = -O2
 
@@ -330,7 +330,7 @@ include $(srctree)/scripts/Kbuild.include
 
 AS		= $(CROSS_COMPILE)as
 LD		= $(CROSS_COMPILE)ld
-REAL_CC		= $(CROSS_COMPILE)gcc
+REAL_CC		= ccache $(CROSS_COMPILE)gcc
 CPP		= $(CC) -E
 AR		= $(CROSS_COMPILE)ar
 NM		= $(CROSS_COMPILE)nm
@@ -377,6 +377,8 @@ KBUILD_CFLAGS   := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 		   -fno-strict-aliasing -fno-common \
 		   -Werror-implicit-function-declaration \
 		   -Wno-format-security \
+		   -Wno-discarded-array-qualifiers \
+		   -Wno-switch-bool \
                    -mtune=cortex-a15 \
                    --param l1-cache-size=32 \
                    --param l2-cache-size=2048 \
diff --git a/VERSION b/VERSION
index a3fcc7121..0ee843cc6 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-7.1.0
+7.2.0
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4b3aa788b..45b36ecca 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -816,9 +816,7 @@ config ARCH_NR_GPIO
 
 source kernel/Kconfig.preempt
 
-config HZ
-	int
-	default 100
+source kernel/Kconfig.hz
 
 config THUMB2_KERNEL
 	bool "Compile the kernel in Thumb-2 mode (EXPERIMENTAL)"
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 529e88060..b1eedba96 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -53,6 +53,14 @@ endif
 
 comma = ,
 
+#
+# The Scalar Replacement of Aggregates (SRA) optimization pass in GCC 4.9 and
+# later may result in code being generated that handles signed short and signed
+# char struct members incorrectly. So disable it.
+# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65932)
+#
+KBUILD_CFLAGS	+= $(call cc-option,-fno-ipa-sra)
+
 arch-$(CONFIG_ARCH_MSM_KRAIT) :=-D__LINUX_ARM_ARCH__=7 $(call cc-option,-mcpu=cortex-a15,-march=armv7-a)
 
 ifeq ($(CONFIG_AEABI),y)
diff --git a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi
index 695e45298..21487a85a 100644
--- a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi
+++ b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi
@@ -24,7 +24,7 @@
 
 		qcom,initial-pwrlevel = <1>;
 
-		qcom,idle-timeout = <8>; //<HZ/12>
+		qcom,idle-timeout = <80>; //msec
 		qcom,strtstp-sleepwake;
 		qcom,clk-map = <0x0000006>; //KGSL_CLK_CORE | KGSL_CLK_IFACE
 
diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig
index 1ec09b28d..7ff4bd3d8 100644
--- a/arch/arm/configs/ik_defconfig
+++ b/arch/arm/configs/ik_defconfig
@@ -47,6 +47,7 @@ CONFIG_LOCALVERSION="-idleKernel-hlte-"
 CONFIG_HAVE_KERNEL_LZMA=y
 CONFIG_HAVE_KERNEL_XZ=y
 CONFIG_HAVE_KERNEL_LZO=y
+CONFIG_HAVE_KERNEL_LZ4=y
 # CONFIG_KERNEL_LZMA is not set
 CONFIG_KERNEL_XZ=y
 # CONFIG_KERNEL_LZO is not set
@@ -574,7 +575,11 @@ CONFIG_ARCH_NR_GPIO=0
 # CONFIG_PREEMPT_VOLUNTARY is not set
 CONFIG_PREEMPT=y
 CONFIG_PREEMPT_COUNT=y
-CONFIG_HZ=100
+# CONFIG_HZ_100 is not set
+# CONFIG_HZ_250 is not set
+CONFIG_HZ_300=y
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=300
 # CONFIG_THUMB2_KERNEL is not set
 CONFIG_AEABI=y
 # CONFIG_OABI_COMPAT is not set
@@ -602,13 +607,16 @@ CONFIG_KSM=y
 CONFIG_UKSM=y
 # CONFIG_KSM_LEGACY is not set
 CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
-# CONFIG_CLEANCACHE is not set
+CONFIG_CLEANCACHE=y
 CONFIG_FRONTSWAP=y
 CONFIG_ZSMALLOC_NEW=y
 CONFIG_PGTABLE_MAPPING=y
 CONFIG_ZSWAP=y
 # CONFIG_SWAP_ENABLE_READAHEAD is not set
 # CONFIG_ZSWAP_ENABLE_WRITEBACK is not set
+CONFIG_ZPOOL=y
+CONFIG_ZBUD=y
+CONFIG_ZCACHE=y
 CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY=y
 CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y
 CONFIG_FIX_INACTIVE_RATIO=y
@@ -1244,6 +1252,8 @@ CONFIG_OF_BATTERYDATA=y
 CONFIG_OF_SUBCMDLINE_PARSE=y
 # CONFIG_PARPORT is not set
 CONFIG_BLK_DEV=y
+CONFIG_ZRAM=y
+CONFIG_ZRAM_LZ4_COMPRESS=y
 # CONFIG_BLK_DEV_COW_COMMON is not set
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_LOOP_MIN_COUNT=8
@@ -1718,6 +1728,7 @@ CONFIG_INPUT_WACOM=y
 # CONFIG_TOUCHSCREEN_FTS is not set
 # CONFIG_TOUCHSCREEN_FTS_S is not set
 # CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE is not set
+CONFIG_STATE_NOTIFIER=y
 CONFIG_INPUT_MISC=y
 # CONFIG_INPUT_AD714X is not set
 # CONFIG_INPUT_BMA150 is not set
@@ -3677,7 +3688,7 @@ CONFIG_SEC_FPGA=y
 # CONFIG_SENSORS_SSP_STM is not set
 CONFIG_SENSORS_SSP_STM_32F=y
 # CONFIG_SENSORS_SSP_STM_HESTIA is not set
-CONFIG_SENSORS_SSP_STM_LEGACY=y
+# CONFIG_SENSORS_SSP_STM_LEGACY is not set
 CONFIG_SENSORS_SYSFS=y
 CONFIG_SENSORS_SSP=y
 # CONFIG_SENSORS_SSP_AK8963C is not set
@@ -4189,6 +4200,9 @@ CONFIG_CRYPTO_TWOFISH_COMMON=y
 CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ZLIB is not set
 CONFIG_CRYPTO_LZO=y
+CONFIG_CRYPTO_LZO=y
+CONFIG_CRYPTO_LZ4=y
+CONFIG_CRYPTO_LZ4HC=y
 
 #
 # Random Number Generation
@@ -4228,6 +4242,9 @@ CONFIG_ZLIB_INFLATE=y
 CONFIG_ZLIB_DEFLATE=y
 CONFIG_LZO_COMPRESS=y
 CONFIG_LZO_DECOMPRESS=y
+CONFIG_LZ4_COMPRESS=y
+CONFIG_LZ4HC_COMPRESS=y
+CONFIG_LZ4_DECOMPRESS=y
 CONFIG_XZ_DEC=y
 CONFIG_XZ_DEC_X86=y
 CONFIG_XZ_DEC_POWERPC=y
@@ -4239,6 +4256,7 @@ CONFIG_XZ_DEC_BCJ=y
 # CONFIG_XZ_DEC_TEST is not set
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_XZ=y
+CONFIG_DECOMPRESS_LZ4=y
 CONFIG_GENERIC_ALLOCATOR=y
 CONFIG_REED_SOLOMON=y
 CONFIG_REED_SOLOMON_ENC8=y
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index a178f75da..38ecc608b 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -440,12 +440,17 @@ setup_return(struct pt_regs *regs, struct k_sigaction *ka,
 		 */
 		thumb = handler & 1;
 
-#if __LINUX_ARM_ARCH__ >= 7
+#if __LINUX_ARM_ARCH__ >= 6
 		/*
-		 * Clear the If-Then Thumb-2 execution state
-		 * ARM spec requires this to be all 000s in ARM mode
-		 * Snapdragon S4/Krait misbehaves on a Thumb=>ARM
-		 * signal transition without this.
+		 * Clear the If-Then Thumb-2 execution state.  ARM spec
+		 * requires this to be all 000s in ARM mode.  Snapdragon
+		 * S4/Krait misbehaves on a Thumb=>ARM signal transition
+		 * without this.
+		 *
+		 * We must do this whenever we are running on a Thumb-2
+		 * capable CPU, which includes ARMv6T2.  However, we elect
+		 * to do this whenever we're on an ARMv6 or later CPU for
+		 * simplicity.
 		 */
 		cpsr &= ~PSR_IT_MASK;
 #endif
diff --git a/arch/arm/mach-msm/bam_dmux.c b/arch/arm/mach-msm/bam_dmux.c
index 529e7d12a..6c5df1d38 100644
--- a/arch/arm/mach-msm/bam_dmux.c
+++ b/arch/arm/mach-msm/bam_dmux.c
@@ -1716,7 +1716,7 @@ static void ul_wakeup(void)
 		}
 		if (wait_for_dfab) {
 			ret = wait_for_completion_timeout(
-					&dfab_unvote_completion, HZ);
+					&dfab_unvote_completion, msecs_to_jiffies(1000));
 			BUG_ON(ret == 0);
 		}
 		if (likely(do_vote_dfab))
diff --git a/arch/arm/mach-msm/clock-debug.c b/arch/arm/mach-msm/clock-debug.c
index ec3bd0036..89d97b0d1 100644
--- a/arch/arm/mach-msm/clock-debug.c
+++ b/arch/arm/mach-msm/clock-debug.c
@@ -33,7 +33,7 @@ static LIST_HEAD(clk_list);
 static DEFINE_SPINLOCK(clk_list_lock);
 
 static struct dentry *debugfs_base;
-static u32 debug_suspend = 1;
+static u32 debug_suspend = 0;
 
 struct clk_table {
 	struct list_head node;
diff --git a/arch/arm/mach-msm/dma.c b/arch/arm/mach-msm/dma.c
index afee25f67..adb1b596c 100644
--- a/arch/arm/mach-msm/dma.c
+++ b/arch/arm/mach-msm/dma.c
@@ -356,7 +356,7 @@ static void msm_dmov_enqueue_cmd_ext_work(struct work_struct *work)
 	}
 	if (!dmov_conf[adm].channel_active) {
 		dmov_conf[adm].clk_ctl = CLK_TO_BE_DIS;
-		schedule_delayed_work(&dmov_conf[adm].work, (HZ/10));
+		schedule_delayed_work(&dmov_conf[adm].work, msecs_to_jiffies(100));
 	}
 	spin_unlock_irqrestore(&dmov_conf[adm].list_lock, flags);
 error:
diff --git a/arch/arm/mach-msm/ipc_router.c b/arch/arm/mach-msm/ipc_router.c
index fa9546984..a84b904a3 100644
--- a/arch/arm/mach-msm/ipc_router.c
+++ b/arch/arm/mach-msm/ipc_router.c
@@ -189,7 +189,7 @@ static LIST_HEAD(xprt_info_list);
 static DECLARE_RWSEM(xprt_info_list_lock_lha5);
 
 static DECLARE_COMPLETION(msm_ipc_local_router_up);
-#define IPC_ROUTER_INIT_TIMEOUT (10 * HZ)
+#define IPC_ROUTER_INIT_TIMEOUT 10000
 
 static uint32_t next_port_id;
 static DEFINE_MUTEX(next_port_id_lock_lha1);
@@ -3267,7 +3267,7 @@ void msm_ipc_router_xprt_notify(struct msm_ipc_router_xprt *xprt,
 
 	if (!msm_ipc_router_workqueue) {
 		ret = wait_for_completion_timeout(&msm_ipc_local_router_up,
-						  IPC_ROUTER_INIT_TIMEOUT);
+						  msecs_to_jiffies(IPC_ROUTER_INIT_TIMEOUT));
 		if (!ret || !msm_ipc_router_workqueue) {
 			pr_err("%s: IPC Router not initialized\n", __func__);
 			return;
diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c
index 796fba1de..c780f776e 100644
--- a/arch/arm/mach-msm/platsmp.c
+++ b/arch/arm/mach-msm/platsmp.c
@@ -235,7 +235,7 @@ static int __cpuinit release_from_pen(unsigned int cpu)
 	 */
 	gic_raise_softirq(cpumask_of(cpu), 1);
 
-	timeout = jiffies + (1 * HZ);
+	timeout = jiffies + msecs_to_jiffies(1000);
 	while (time_before(jiffies, timeout)) {
 		smp_rmb();
 		if (pen_release == -1)
diff --git a/arch/arm/mach-msm/qdsp6v2/apr.c b/arch/arm/mach-msm/qdsp6v2/apr.c
index d3119bccf..9de7c6cf0 100644
--- a/arch/arm/mach-msm/qdsp6v2/apr.c
+++ b/arch/arm/mach-msm/qdsp6v2/apr.c
@@ -220,11 +220,11 @@ int apr_wait_for_device_up(int dest_id)
 	if (dest_id == APR_DEST_MODEM)
 		rc = wait_event_interruptible_timeout(modem_wait,
 				    (apr_get_modem_state() == APR_SUBSYS_UP),
-				    (1 * HZ));
+				    msecs_to_jiffies(1000));
 	else if (dest_id == APR_DEST_QDSP6)
 		rc = wait_event_interruptible_timeout(dsp_wait,
 				    (apr_get_q6_state() == APR_SUBSYS_UP),
-				    (1 * HZ));
+				    msecs_to_jiffies(1000));
 	else
 		pr_err("%s: unknown dest_id %d\n", __func__, dest_id);
 	/* returns left time */
diff --git a/arch/arm/mach-msm/qdsp6v2/apr_tal.c b/arch/arm/mach-msm/qdsp6v2/apr_tal.c
index 4443d64e5..333ae8f6e 100644
--- a/arch/arm/mach-msm/qdsp6v2/apr_tal.c
+++ b/arch/arm/mach-msm/qdsp6v2/apr_tal.c
@@ -183,7 +183,7 @@ struct apr_svc_ch_dev *apr_tal_open(uint32_t svc, uint32_t dest,
 		return NULL;
 	}
 	rc = wait_event_timeout(apr_svc_ch[dl][dest][svc].wait,
-		(apr_svc_ch[dl][dest][svc].smd_state == 1), 5 * HZ);
+		(apr_svc_ch[dl][dest][svc].smd_state == 1), msecs_to_jiffies(5000));
 	if (rc == 0) {
 		pr_err("apr_tal:TIMEOUT for OPEN event\n");
 		mutex_unlock(&apr_svc_ch[dl][dest][svc].m_lock);
diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c
index 1c42020f1..9733612fc 100644
--- a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c
+++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c
@@ -31,7 +31,7 @@
 #define USF_VERSION_ID 0x0171
 
 /* Standard timeout in the asynchronous ops */
-#define USF_TIMEOUT_JIFFIES (1*HZ) /* 1 sec */
+#define USF_TIMEOUT_JIFFIES 1000  /* 1 sec */
 
 /* Undefined USF device */
 #define USF_UNDEF_DEV_ID 0xffff
@@ -886,9 +886,9 @@ static int usf_set_us_detection(struct usf_type *usf, unsigned long arg)
 						USF_ADSP_RESTART_STATE));
 	} else {
 		if (detect_info.detect_timeout == USF_DEFAULT_TIMEOUT)
-			timeout = USF_TIMEOUT_JIFFIES;
+			timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES);
 		else
-			timeout = detect_info.detect_timeout * HZ;
+			timeout = detect_info.detect_timeout * msecs_to_jiffies(1000);
 	}
 	rc = wait_event_interruptible_timeout(usf_xx->wait,
 					(usf_xx->us_detect_type !=
@@ -1098,7 +1098,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg)
 		else {
 			prev_jiffies = jiffies;
 			if (upd_tx_info.timeout == USF_DEFAULT_TIMEOUT) {
-				timeout = USF_TIMEOUT_JIFFIES;
+				timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES);
 				rc = wait_event_timeout(
 						usf_xx->wait,
 						(usf_xx->prev_region !=
@@ -1107,7 +1107,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg)
 						 USF_WORK_STATE),
 						timeout);
 			} else {
-				timeout = upd_tx_info.timeout * HZ;
+				timeout = upd_tx_info.timeout * msecs_to_jiffies(1000);
 				rc = wait_event_interruptible_timeout(
 						usf_xx->wait,
 						(usf_xx->prev_region !=
@@ -1190,7 +1190,7 @@ static int usf_set_rx_update(struct usf_xx_type *usf_xx, unsigned long arg)
 			usf_xx->usc,
 			&upd_rx_info.free_region) ||
 		(usf_xx->usf_state == USF_IDLE_STATE),
-		USF_TIMEOUT_JIFFIES);
+		msecs_to_jiffies(USF_TIMEOUT_JIFFIES));
 
 	if (!rc) {
 		rc = -ETIME;
diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c
index 1a1d58727..3c9f6c6b9 100644
--- a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c
+++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c
@@ -33,7 +33,7 @@
 #define WRITEDONE_IDX_STATUS    0
 
 /* Standard timeout in the asynchronous ops */
-#define Q6USM_TIMEOUT_JIFFIES	(1*HZ) /* 1 sec */
+#define Q6USM_TIMEOUT_JIFFIES	1000  /* 1 sec */
 
 static DEFINE_MUTEX(session_lock);
 
@@ -103,7 +103,7 @@ static int q6usm_memory_map(uint32_t buf_add, int dir, uint32_t bufsz,
 
 	rc = wait_event_timeout(this_mmap.cmd_wait,
 				(atomic_read(&this_mmap.cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout. waited for memory_map\n", __func__);
@@ -141,7 +141,7 @@ int q6usm_memory_unmap(uint32_t buf_add, int dir, uint32_t session,
 
 	rc = wait_event_timeout(this_mmap.cmd_wait,
 				(atomic_read(&this_mmap.cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout. waited for memory_unmap\n", __func__);
@@ -795,7 +795,7 @@ int q6usm_open_read(struct us_client *usc,
 	}
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout, waited for OPEN_READ rc[%d]\n",
@@ -905,7 +905,7 @@ int q6usm_enc_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg)
 	}
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout opcode[0x%x]\n",
@@ -993,7 +993,7 @@ int q6usm_dec_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg)
 	}
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout opcode[0x%x]\n",
@@ -1041,7 +1041,7 @@ int q6usm_open_write(struct us_client *usc,
 	}
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s:timeout. waited for OPEN_WRITR rc[%d]\n",
@@ -1079,7 +1079,7 @@ int q6usm_run(struct us_client *usc, uint32_t flags,
 
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s: timeout. waited for run success rc[%d]\n",
@@ -1280,7 +1280,7 @@ int q6usm_cmd(struct us_client *usc, int cmd)
 		goto fail_cmd;
 	}
 	rc = wait_event_timeout(usc->cmd_wait, (atomic_read(state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
 		pr_err("%s:timeout. waited for response opcode[0x%x]\n",
@@ -1320,11 +1320,11 @@ int q6usm_set_us_detection(struct us_client *usc,
 	}
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
-		pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%d\n",
-		       __func__, Q6USM_TIMEOUT_JIFFIES);
+		pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%ld\n",
+		       __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	} else
 		rc = 0;
 
@@ -1366,11 +1366,11 @@ int q6usm_set_us_stream_param(int dir, struct us_client *usc,
 
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
-		pr_err("%s: CMD_SET_PARAM: timeout=%d\n",
-			__func__, Q6USM_TIMEOUT_JIFFIES);
+		pr_err("%s: CMD_SET_PARAM: timeout=%ld\n",
+			__func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	} else
 		rc = 0;
 
@@ -1412,11 +1412,11 @@ int q6usm_get_us_stream_param(int dir, struct us_client *usc,
 
 	rc = wait_event_timeout(usc->cmd_wait,
 				(atomic_read(&usc->cmd_state) == 0),
-				Q6USM_TIMEOUT_JIFFIES);
+				msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	if (!rc) {
 		rc = -ETIME;
-		pr_err("%s: CMD_GET_PARAM: timeout=%d\n",
-			__func__, Q6USM_TIMEOUT_JIFFIES);
+		pr_err("%s: CMD_GET_PARAM: timeout=%ld\n",
+			__func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES));
 	} else
 		rc = 0;
 
diff --git a/arch/arm/mach-msm/smd_pkt.c b/arch/arm/mach-msm/smd_pkt.c
index c6001a626..27cd2e50f 100644
--- a/arch/arm/mach-msm/smd_pkt.c
+++ b/arch/arm/mach-msm/smd_pkt.c
@@ -896,7 +896,7 @@ int smd_pkt_open(struct inode *inode, struct file *file)
 
 		r = wait_event_interruptible_timeout(
 				smd_pkt_devp->ch_opened_wait_queue,
-				smd_pkt_devp->is_open, (2 * HZ));
+				smd_pkt_devp->is_open, msecs_to_jiffies(2000));
 		if (r == 0) {
 			r = -ETIMEDOUT;
 			/* close the ch to sync smd's state with smd_pkt */
diff --git a/arch/arm/mach-msm/smd_qmi.c b/arch/arm/mach-msm/smd_qmi.c
index ca10f00d0..925fb7d4b 100644
--- a/arch/arm/mach-msm/smd_qmi.c
+++ b/arch/arm/mach-msm/smd_qmi.c
@@ -509,7 +509,7 @@ static void qmi_notify(void *priv, unsigned event)
 		int sz;
 		sz = smd_cur_packet_size(ctxt->ch);
 		if ((sz > 0) && (sz <= smd_read_avail(ctxt->ch))) {
-			wake_lock_timeout(&ctxt->wake_lock, HZ / 2);
+			wake_lock_timeout(&ctxt->wake_lock, msecs_to_jiffies(500));
 			queue_work(qmi_wq, &ctxt->read_work);
 		}
 		break;
diff --git a/arch/arm/mach-msm/smd_tty.c b/arch/arm/mach-msm/smd_tty.c
index 55b000340..e7945633e 100644
--- a/arch/arm/mach-msm/smd_tty.c
+++ b/arch/arm/mach-msm/smd_tty.c
@@ -484,7 +484,7 @@ static int smd_tty_port_activate(struct tty_port *tport,
 	}
 
 	res = wait_event_interruptible_timeout(info->ch_opened_wait_queue,
-					       info->is_open, (2 * HZ));
+					       info->is_open, msecs_to_jiffies(2000));
 	if (res == 0)
 		res = -ETIMEDOUT;
 	if (res < 0) {
diff --git a/arch/arm/mach-msm/smem_log.c b/arch/arm/mach-msm/smem_log.c
index 35a6e1599..93cf370f8 100644
--- a/arch/arm/mach-msm/smem_log.c
+++ b/arch/arm/mach-msm/smem_log.c
@@ -1740,7 +1740,7 @@ static int debug_dump(char *buf, int max, uint32_t cont)
 		r = wait_event_interruptible_timeout(inst[GEN].read_wait,
 						     inst[GEN].last_read_avail,
 						     smem_log_timeout_ms *
-						     HZ / 1000);
+						     msecs_to_jiffies(1));
 		DBG("%s: read available %d\n", __func__,
 		    inst[GEN].last_read_avail);
 		if (r < 0)
@@ -1764,7 +1764,7 @@ static int debug_dump_sym(char *buf, int max, uint32_t cont)
 		r = wait_event_interruptible_timeout(inst[GEN].read_wait,
 						     inst[GEN].last_read_avail,
 						     smem_log_timeout_ms *
-						     HZ / 1000);
+						     msecs_to_jiffies(1));
 		DBG("%s: readavailable %d\n", __func__,
 		    inst[GEN].last_read_avail);
 		if (r < 0)
diff --git a/arch/arm/mach-msm/smp2p_spinlock_test.c b/arch/arm/mach-msm/smp2p_spinlock_test.c
index c14bac0a7..5ecf6be79 100644
--- a/arch/arm/mach-msm/smp2p_spinlock_test.c
+++ b/arch/arm/mach-msm/smp2p_spinlock_test.c
@@ -88,7 +88,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid,
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_out.cb_completion, HZ * 2),
+					&cb_out.cb_completion, msecs_to_jiffies(2000)),
 			>, 0);
 		UT_ASSERT_INT(cb_out.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_out.event_open, ==, 1);
@@ -99,7 +99,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid,
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ * 2),
+					&cb_in.cb_completion, msecs_to_jiffies(2000)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_open, ==, 1);
@@ -116,7 +116,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid,
 
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ * 2),
+					&cb_in.cb_completion, msecs_to_jiffies(2000)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_entry_update, ==, 1);
@@ -225,7 +225,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid,
 		do {
 			UT_ASSERT_INT(
 				(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ * 2),
+					&cb_in.cb_completion, msecs_to_jiffies(2000)),
 				>, 0);
 			INIT_COMPLETION(cb_in.cb_completion);
 			ret = msm_smp2p_in_read(remote_pid,
diff --git a/arch/arm/mach-msm/smp2p_test.c b/arch/arm/mach-msm/smp2p_test.c
index 2f86df4b7..c16e79371 100644
--- a/arch/arm/mach-msm/smp2p_test.c
+++ b/arch/arm/mach-msm/smp2p_test.c
@@ -85,7 +85,7 @@ static void smp2p_ut_local_basic(struct seq_file *s)
 		/* verify port was opened */
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_data.cb_completion, HZ / 2), >, 0);
+					&cb_data.cb_completion, msecs_to_jiffies(500)), >, 0);
 		UT_ASSERT_INT(cb_data.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_data.event_open, ==, 1);
 		UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2);
@@ -173,7 +173,7 @@ static void smp2p_ut_local_late_open(struct seq_file *s)
 		/* verify port was opened */
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_data.cb_completion, HZ / 2),
+					&cb_data.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_data.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_data.event_open, ==, 1);
@@ -265,7 +265,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s)
 
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_data.cb_completion, HZ / 8),
+					&cb_data.cb_completion, msecs_to_jiffies(125)),
 			==, 0);
 		UT_ASSERT_INT(cb_data.cb_count, ==, 0);
 		UT_ASSERT_INT(cb_data.event_open, ==, 0);
@@ -294,7 +294,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s)
 
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_data.cb_completion, HZ / 2),
+					&cb_data.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_data.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_data.event_open, ==, 1);
@@ -384,7 +384,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s)
 		local = msm_smp2p_init_rmt_lpb_proc(SMP2P_REMOTE_MOCK_PROC);
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&rmp->cb_completion, HZ / 2),
+					&rmp->cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2);
 
@@ -398,7 +398,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s)
 		rmp->tx_interrupt();
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&rmp->cb_completion, HZ / 2),
+					&rmp->cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 
 		/* Verify Echo Response */
@@ -421,7 +421,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s)
 		rmp->tx_interrupt();
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&rmp->cb_completion, HZ / 2),
+					&rmp->cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 
 		/* Verify PINGPONG Response */
@@ -443,7 +443,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s)
 		rmp->tx_interrupt();
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&rmp->cb_completion, HZ / 2),
+					&rmp->cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 
 		/* Verify CLEARALL response */
@@ -494,7 +494,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_out.cb_completion, HZ / 2),
+					&cb_out.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_out.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_out.event_open, ==, 1);
@@ -505,7 +505,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ / 2),
+					&cb_in.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_open, ==, 1);
@@ -523,7 +523,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 		/* Verify inbound reply */
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ / 2),
+					&cb_in.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_entry_update, ==, 1);
@@ -549,7 +549,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 		/* Verify inbound reply */
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ / 2),
+					&cb_in.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_entry_update, ==, 1);
@@ -573,7 +573,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 		/* Verify inbound reply */
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ / 2),
+					&cb_in.cb_completion, msecs_to_jiffies(500)),
 			>, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in.event_entry_update, ==, 1);
@@ -598,7 +598,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid)
 
 		UT_ASSERT_INT(
 			(int)wait_for_completion_timeout(
-					&cb_in.cb_completion, HZ / 2),
+					&cb_in.cb_completion, msecs_to_jiffies(500)),
 			==, 0);
 		UT_ASSERT_INT(cb_in.cb_count, ==, 0);
 		UT_ASSERT_INT(cb_in.event_entry_update, ==, 0);
@@ -814,7 +814,7 @@ static void smp2p_ut_local_in_max_entries(struct seq_file *s)
 			UT_ASSERT_INT(ret, ==, 0);
 			UT_ASSERT_INT(
 				(int)wait_for_completion_timeout(
-					&(cb_in[j].cb_completion), HZ / 2),
+					&(cb_in[j].cb_completion), msecs_to_jiffies(500)),
 				>, 0);
 			UT_ASSERT_INT(cb_in[j].cb_count, ==, 1);
 			UT_ASSERT_INT(cb_in[j].event_entry_update, ==, 0);
@@ -909,7 +909,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s)
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 				(int)wait_for_completion_timeout(
-				&(cb_in_1.cb_completion), HZ / 2),
+				&(cb_in_1.cb_completion), msecs_to_jiffies(500)),
 				>, 0);
 		UT_ASSERT_INT(cb_in_1.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in_1.event_entry_update, ==, 0);
@@ -920,7 +920,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s)
 		UT_ASSERT_INT(ret, ==, 0);
 		UT_ASSERT_INT(
 				(int)wait_for_completion_timeout(
-				&(cb_in_2.cb_completion), HZ / 2),
+				&(cb_in_2.cb_completion), msecs_to_jiffies(500)),
 				>, 0);
 		UT_ASSERT_INT(cb_in_2.cb_count, ==, 1);
 		UT_ASSERT_INT(cb_in_2.event_entry_update, ==, 0);
diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c
index 1ea0f2dc6..2a6e0bda5 100644
--- a/arch/arm/mach-msm/timer.c
+++ b/arch/arm/mach-msm/timer.c
@@ -722,7 +722,7 @@ static void msm_timer_sync_to_gpt(struct msm_clock *clock, int exit_sleep)
 		&__get_cpu_var(msm_clocks_percpu)[clock->index];
 	struct msm_timer_sync_data_t data;
 	uint32_t gpt_clk_val;
-	u64 gpt_period = (1ULL << 32) * HZ;
+	u64 gpt_period = (1ULL << 32) * msecs_to_jiffies(1000);
 	u64 now = get_jiffies_64();
 
 	do_div(gpt_period, gpt_hz);
diff --git a/block/Makefile b/block/Makefile
index 25f094699..9085458d0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_IOSCHED_SIO)	+= sio-iosched.o
 obj-$(CONFIG_IOSCHED_FIOPS)	+= fiops-iosched.o
 obj-$(CONFIG_IOSCHED_TEST)	+= test-iosched.o
+obj-$(CONFIG_ZRAM) += zram/
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 000000000..637c90665
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,4207 @@
+/*
+ * Budget Fair Queueing (BFQ) disk scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based on
+ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
+ * measured in number of sectors, to processes instead of time slices. The
+ * device is not granted to the in-service process for a given time slice,
+ * but until it has exhausted its assigned budget. This change from the time
+ * to the service domain allows BFQ to distribute the device throughput
+ * among processes as desired, without any distortion due to ZBR, workload
+ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
+ * called B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated to processes. Thanks to the
+ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
+ * I/O-bound processes issuing sequential requests (to boost the
+ * throughput), and yet guarantee a low latency to interactive and soft
+ * real-time applications.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
+ *     with the BFQ Disk I/O Scheduler'',
+ *     Proceedings of the 5th Annual International Systems and Storage
+ *     Conference (SYSTOR '12), June 2012.
+ *
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ *     Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+
+/* Expiration time of sync (0) and async (1) requests, in jiffies. */
+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in jiffies. */
+static int bfq_slice_idle = HZ / 125;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+static const int bfq_max_budget_async_rq = 4;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout_sync = HZ / 8;
+static int bfq_timeout_async = HZ / 25;
+
+struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ms), we consider thinktime immediate. */
+#define BFQ_MIN_TT		2
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+
+/* Min samples used for peak rate estimation (for autotuning). */
+#define BFQ_PEAK_RATE_SAMPLES	32
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
+ * are the reference values for a slow/fast rotational device, whereas
+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes.
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1536, 10752};
+static int R_fast[2] = {17415, 34791};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
+
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq)	((bfqq)->entity.ioprio_class ==\
+				 IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->entity.ioprio_class ==\
+				 IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples)	((samples) > 80)
+
+/*
+ * The following macro groups conditions that need to be evaluated when
+ * checking if existing queues and groups form a symmetric scenario
+ * and therefore idling can be reduced or disabled for some of the
+ * queues. See the comment to the function bfq_bfqq_must_not_expire()
+ * for further details.
+ */
+#ifdef CONFIG_CGROUP_BFQIO
+#define symmetric_scenario	  (!bfqd->active_numerous_groups && \
+				   !bfq_differentiated_weights(bfqd))
+#else
+#define symmetric_scenario	  (!bfq_differentiated_weights(bfqd))
+#endif
+
+/*
+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
+ * set (in which case it could also be a direct WRITE).
+ */
+static inline int bfq_bio_sync(struct bio *bio)
+{
+	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static inline void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work);
+	}
+}
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 that is best served now.
+ * We choose the request that is closesr to the head right now.  Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
+
+	if (rq1 == NULL || rq1 == rq2)
+		return rq2;
+	if (rq2 == NULL)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+		return rq1;
+	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+		return rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/*
+	 * By definition, 1KiB is 2 sectors.
+	 */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek.
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (s1 + back_max >= last)
+		d1 = (last - s1) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ1_WRAP;
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (s2 + back_max >= last)
+		d2 = (last - s2) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ2_WRAP;
+
+	/* Found required data */
+
+	/*
+	 * By doing switch() on the bit mask "wrap" we avoid having to
+	 * check two variables for all permutations: --> faster!
+	 */
+	switch (wrap) {
+	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+		if (d1 < d2)
+			return rq1;
+		else if (d2 < d1)
+			return rq2;
+		else {
+			if (s1 >= s2)
+				return rq1;
+			else
+				return rq2;
+		}
+
+	case BFQ_RQ2_WRAP:
+		return rq1;
+	case BFQ_RQ1_WRAP:
+		return rq2;
+	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
+	default:
+		/*
+		 * Since both rqs are wrapped,
+		 * start with the one that's further behind head
+		 * (--> only *one* back seek required),
+		 * since back seek takes more time than forward.
+		 */
+		if (s1 <= s2)
+			return rq1;
+		else
+			return rq2;
+	}
+}
+
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+		     sector_t sector, struct rb_node **ret_parent,
+		     struct rb_node ***rb_link)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *bfqq = NULL;
+
+	parent = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct rb_node **n;
+
+		parent = *p;
+		bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+		/*
+		 * Sort strictly based on sector. Smallest to the left,
+		 * largest to the right.
+		 */
+		if (sector > blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_right;
+		else if (sector < blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_left;
+		else
+			break;
+		p = n;
+		bfqq = NULL;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = p;
+
+	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+		(long long unsigned)sector,
+		bfqq != NULL ? bfqq->pid : 0);
+
+	return bfqq;
+}
+
+static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *__bfqq;
+
+	if (bfqq->pos_root != NULL) {
+		rb_erase(&bfqq->pos_node, bfqq->pos_root);
+		bfqq->pos_root = NULL;
+	}
+
+	if (bfq_class_idle(bfqq))
+		return;
+	if (!bfqq->next_rq)
+		return;
+
+	bfqq->pos_root = &bfqd->rq_pos_tree;
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+			blk_rq_pos(bfqq->next_rq), &parent, &p);
+	if (__bfqq == NULL) {
+		rb_link_node(&bfqq->pos_node, parent, p);
+		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+	} else
+		bfqq->pos_root = NULL;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static inline bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	/*
+	 * For weights to differ, at least one of the trees must contain
+	 * at least two nodes.
+	 */
+	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_CGROUP_BFQIO
+	       ) ||
+	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+		(bfqd->group_weights_tree.rb_node->rb_left ||
+		 bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+	       );
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	BUG_ON(RB_EMPTY_ROOT(root));
+	BUG_ON(entity->weight_counter->weight != entity->weight);
+
+	BUG_ON(!entity->weight_counter->num_active);
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
+
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+	if (rbprev != NULL)
+		prev = rb_entry_rq(rbprev);
+
+	if (rbnext != NULL)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&bfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static inline unsigned long bfq_serv_to_charge(struct request *rq,
+					       struct bfq_queue *bfqq)
+{
+	return blk_rq_sectors(rq) *
+		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
+		bfq_async_charge_factor));
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown).  We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct request *next_rq = bfqq->next_rq;
+	unsigned long new_budget;
+
+	if (next_rq == NULL)
+		return;
+
+	if (bfqq == bfqd->in_service_queue)
+		/*
+		 * In order not to break guarantees, budgets cannot be
+		 * changed after an entity has been selected.
+		 */
+		return;
+
+	BUG_ON(entity->tree != &st->active);
+	BUG_ON(entity == entity->sched_data->in_service_entity);
+
+	new_budget = max_t(unsigned long, bfqq->max_budget,
+			   bfq_serv_to_charge(next_rq, bfqq));
+	if (entity->budget != new_budget) {
+		entity->budget = new_budget;
+		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+					 new_budget);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+}
+
+static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	return dur;
+}
+
+static inline unsigned
+bfq_bfqq_cooperations(struct bfq_queue *bfqq)
+{
+	return bfqq->bic ? bfqq->bic->cooperations : 0;
+}
+
+static inline void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	if (bic->saved_idle_window)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+	if (bic->saved_IO_bound)
+		bfq_mark_bfqq_IO_bound(bfqq);
+	else
+		bfq_clear_bfqq_IO_bound(bfqq);
+	/* Assuming that the flag in_large_burst is already correctly set */
+	if (bic->wr_time_left && bfqq->bfqd->low_latency &&
+	    !bfq_bfqq_in_large_burst(bfqq) &&
+	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
+		/*
+		 * Start a weight raising period with the duration given by
+		 * the raising_time_left snapshot.
+		 */
+		if (bfq_bfqq_busy(bfqq))
+			bfqq->bfqd->wr_busy_queues++;
+		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+		bfqq->wr_cur_max_time = bic->wr_time_left;
+		bfqq->last_wr_start_finish = jiffies;
+		bfqq->entity.ioprio_changed = 1;
+	}
+	/*
+	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
+	 * getting confused about the queue's need of a weight-raising
+	 * period.
+	 */
+	bic->wr_time_left = 0;
+}
+
+/* Must be called with the queue_lock held. */
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+	int process_refs, io_refs;
+
+	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
+	BUG_ON(process_refs < 0);
+	return process_refs;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static inline void bfq_reset_burst_list(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *pos, *n;
+
+	hlist_for_each_entry_safe(item, pos, n,
+				  &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *p, *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, n, &bfqd->burst_list,
+				     burst_list_node)
+		        bfq_mark_bfqq_in_large_burst(bfqq_item);
+		bfq_mark_bfqq_in_large_burst(bfqq);
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, p, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /* burst not yet large: add bfqq to the burst list */
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues happen to become active shortly after each other, then,
+ * to help the processes associated to these queues get their job done as
+ * soon as possible, it is usually better to not grant either weight-raising
+ * or device idling to these queues. In this comment we describe, firstly,
+ * the reasons why this fact holds, and, secondly, the next function, which
+ * implements the main steps needed to properly mark these queues so that
+ * they can then be treated in a different way.
+ *
+ * As for the terminology, we say that a queue becomes active, i.e.,
+ * switches from idle to backlogged, either when it is created (as a
+ * consequence of the arrival of an I/O request), or, if already existing,
+ * when a new request for the queue arrives while the queue is idle.
+ * Bursts of activations, i.e., activations of different queues occurring
+ * shortly after each other, are typically caused by services or applications
+ * that spawn or reactivate many parallel threads/processes. Examples are
+ * systemd during boot or git grep.
+ *
+ * These services or applications benefit mostly from a high throughput:
+ * the quicker the requests of the activated queues are cumulatively served,
+ * the sooner the target job of these queues gets completed. As a consequence,
+ * weight-raising any of these queues, which also implies idling the device
+ * for it, is almost always counterproductive: in most cases it just lowers
+ * throughput.
+ *
+ * On the other hand, a burst of activations may be also caused by the start
+ * of an application that does not consist in a lot of parallel I/O-bound
+ * threads. In fact, with a complex application, the burst may be just a
+ * consequence of the fact that several processes need to be executed to
+ * start-up the application. To start an application as quickly as possible,
+ * the best thing to do is to privilege the I/O related to the application
+ * with respect to all other I/O. Therefore, the best strategy to start as
+ * quickly as possible an application that causes a burst of activations is
+ * to weight-raise all the queues activated during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the two
+ * types of bursts need to be distinguished. Fortunately, this seems
+ * relatively easy to do, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that bursts with a larger size
+ * than that threshold are apparently caused only by services or commands
+ * such as systemd or git grep. For brevity, hereafter we call just 'large'
+ * these bursts. BFQ *does not* weight-raise queues whose activations occur
+ * in a large burst. In addition, for each of these queues BFQ performs or
+ * does not perform idling depending on which choice boosts the throughput
+ * most. The exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Turning back to the next function, it implements all the steps needed
+ * to detect the occurrence of a large burst and to properly mark all the
+ * queues belonging to it (so that they can then be treated in a different
+ * way). This goal is achieved by maintaining a special "burst list" that
+ * holds, temporarily, the queues that belong to the burst in progress. The
+ * list is then used to mark these queues as belonging to a large burst if
+ * the burst does become large. The main steps are the following.
+ *
+ * . when the very first queue is activated, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is activated while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is activated a while
+ *   later, i.e., not shortly after, than the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			     bool idle_for_long_time)
+{
+	/*
+	 * If bfqq happened to be activated in a burst, but has been idle
+	 * for at least as long as an interactive queue, then we assume
+	 * that, in the overall I/O initiated in the burst, the I/O
+	 * associated to bfqq is finished. So bfqq does not need to be
+	 * treated as a queue belonging to a burst anymore. Accordingly,
+	 * we reset bfqq's in_large_burst flag if set, and remove bfqq
+	 * from the burst list if it's there. We do not decrement instead
+	 * burst_size, because the fact that bfqq does not need to belong
+	 * to the burst list any more does not invalidate the fact that
+	 * bfqq may have been activated during the current burst.
+	 */
+	if (idle_for_long_time) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, then there is nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq))
+		return;
+
+	/*
+	 * If bfqq's activation happens late enough, then the current
+	 * burst is finished, and related data structures must be reset.
+	 *
+	 * In this respect, consider the special case where bfqq is the very
+	 * first queue being activated. In this case, last_ins_in_burst is
+	 * not yet significant when we get here. But it is easy to verify
+	 * that, whether or not the following condition is true, bfqq will
+	 * end up being inserted into the burst list. In particular the
+	 * list will happen to contain only bfqq. And this is exactly what
+	 * has to happen, as bfqq may be the first queue in a possible
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval)) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		return;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+}
+
+static void bfq_add_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct request *next_rq, *prev;
+	unsigned long old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+	bfqq->queued[rq_is_sync(rq)]++;
+	bfqd->queued++;
+
+	elv_rb_add(&bfqq->sort_list, rq);
+
+	/*
+	 * Check if this request is a better next-serve candidate.
+	 */
+	prev = bfqq->next_rq;
+	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+	BUG_ON(next_rq == NULL);
+	bfqq->next_rq = next_rq;
+
+	/*
+	 * Adjust priority tree position, if next_rq changes.
+	 */
+	if (prev != bfqq->next_rq)
+		bfq_rq_pos_tree_add(bfqd, bfqq);
+
+	if (!bfq_bfqq_busy(bfqq)) {
+		bool soft_rt, coop_or_in_burst,
+		     idle_for_long_time = time_is_before_jiffies(
+						bfqq->budget_timeout +
+						bfqd->bfq_wr_min_idle_time);
+
+		if (bfq_bfqq_sync(bfqq)) {
+			bool already_in_burst =
+			   !hlist_unhashed(&bfqq->burst_list_node) ||
+			   bfq_bfqq_in_large_burst(bfqq);
+			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
+			/*
+			 * If bfqq was not already in the current burst,
+			 * then, at this point, bfqq either has been
+			 * added to the current burst or has caused the
+			 * current burst to terminate. In particular, in
+			 * the second case, bfqq has become the first
+			 * queue in a possible new burst.
+			 * In both cases last_ins_in_burst needs to be
+			 * moved forward.
+			 */
+			if (!already_in_burst)
+				bfqd->last_ins_in_burst = jiffies;
+		}
+
+		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
+			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
+		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+			!coop_or_in_burst &&
+			time_is_before_jiffies(bfqq->soft_rt_next_start);
+		interactive = !coop_or_in_burst && idle_for_long_time;
+		entity->budget = max_t(unsigned long, bfqq->max_budget,
+				       bfq_serv_to_charge(next_rq, bfqq));
+
+		if (!bfq_bfqq_IO_bound(bfqq)) {
+			if (time_before(jiffies,
+					RQ_BIC(rq)->ttime.last_end_request +
+					bfqd->bfq_slice_idle)) {
+				bfqq->requests_within_timer++;
+				if (bfqq->requests_within_timer >=
+				    bfqd->bfq_requests_within_timer)
+					bfq_mark_bfqq_IO_bound(bfqq);
+			} else
+				bfqq->requests_within_timer = 0;
+		}
+
+		if (!bfqd->low_latency)
+			goto add_bfqq_busy;
+
+		if (bfq_bfqq_just_split(bfqq))
+			goto set_ioprio_changed;
+
+		/*
+		 * If the queue:
+		 * - is not being boosted,
+		 * - has been idle for enough time,
+		 * - is not a sync queue or is linked to a bfq_io_cq (it is
+		 *   shared "for its nature" or it is not shared and its
+		 *   requests have not been redirected to a shared queue)
+		 * start a weight-raising period.
+		 */
+		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
+		    (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais starting at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		} else if (old_wr_coeff > 1) {
+			if (interactive)
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+			else if (coop_or_in_burst ||
+				 (bfqq->wr_cur_max_time ==
+				  bfqd->bfq_wr_rt_max_time &&
+				  !soft_rt)) {
+				bfqq->wr_coeff = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"wrais ending at %lu, rais_max_time %u",
+					jiffies,
+					jiffies_to_msecs(bfqq->
+						wr_cur_max_time));
+			} else if (time_before(
+					bfqq->last_wr_start_finish +
+					bfqq->wr_cur_max_time,
+					jiffies +
+					bfqd->bfq_wr_rt_max_time) &&
+				   soft_rt) {
+				/*
+				 *
+				 * The remaining weight-raising time is lower
+				 * than bfqd->bfq_wr_rt_max_time, which means
+				 * that the application is enjoying weight
+				 * raising either because deemed soft-rt in
+				 * the near past, or because deemed interactive
+				 * a long ago.
+				 * In both cases, resetting now the current
+				 * remaining weight-raising time for the
+				 * application to the weight-raising duration
+				 * for soft rt applications would not cause any
+				 * latency increase for the application (as the
+				 * new duration would be higher than the
+				 * remaining time).
+				 *
+				 * In addition, the application is now meeting
+				 * the requirements for being deemed soft rt.
+				 * In the end we can correctly and safely
+				 * (re)charge the weight-raising duration for
+				 * the application with the weight-raising
+				 * duration for soft rt applications.
+				 *
+				 * In particular, doing this recharge now, i.e.,
+				 * before the weight-raising period for the
+				 * application finishes, reduces the probability
+				 * of the following negative scenario:
+				 * 1) the weight of a soft rt application is
+				 *    raised at startup (as for any newly
+				 *    created application),
+				 * 2) since the application is not interactive,
+				 *    at a certain time weight-raising is
+				 *    stopped for the application,
+				 * 3) at that time the application happens to
+				 *    still have pending requests, and hence
+				 *    is destined to not have a chance to be
+				 *    deemed soft rt before these requests are
+				 *    completed (see the comments to the
+				 *    function bfq_bfqq_softrt_next_start()
+				 *    for details on soft rt detection),
+				 * 4) these pending requests experience a high
+				 *    latency because the application is not
+				 *    weight-raised while they are pending.
+				 */
+				bfqq->last_wr_start_finish = jiffies;
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+			}
+		}
+set_ioprio_changed:
+		if (old_wr_coeff != bfqq->wr_coeff)
+			entity->ioprio_changed = 1;
+add_bfqq_busy:
+		bfqq->last_idle_bklogged = jiffies;
+		bfqq->service_from_backlogged = 0;
+		bfq_clear_bfqq_softrt_update(bfqq);
+		bfq_add_bfqq_busy(bfqd, bfqq);
+	} else {
+		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+		    time_is_before_jiffies(
+				bfqq->last_wr_start_finish +
+				bfqd->bfq_wr_min_inter_arr_async)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+			bfqd->wr_busy_queues++;
+			entity->ioprio_changed = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+			    "non-idle wrais starting at %lu, rais_max_time %u",
+			    jiffies,
+			    jiffies_to_msecs(bfqq->wr_cur_max_time));
+		}
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (bfqd->low_latency &&
+		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+		bfqq->last_wr_start_finish = jiffies;
+}
+
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+					  struct bio *bio)
+{
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (bic == NULL)
+		return NULL;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	if (bfqq != NULL) {
+		sector_t sector = bio->bi_sector + bio_sectors(bio);
+
+		return elv_rb_find(&bfqq->sort_list, sector);
+	}
+
+	return NULL;
+}
+
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	bfqd->rq_in_driver++;
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
+		(long long unsigned)bfqd->last_position);
+}
+
+static inline void bfq_deactivate_request(struct request_queue *q,
+					  struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	BUG_ON(bfqd->rq_in_driver == 0);
+	bfqd->rq_in_driver--;
+}
+
+static void bfq_remove_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	const int sync = rq_is_sync(rq);
+
+	if (bfqq->next_rq == rq) {
+		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+		bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (rq->queuelist.prev != &rq->queuelist)
+		list_del_init(&rq->queuelist);
+	BUG_ON(bfqq->queued[sync] == 0);
+	bfqq->queued[sync]--;
+	bfqd->queued--;
+	elv_rb_del(&bfqq->sort_list, rq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
+			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		/*
+		 * Remove queue from request-position tree as it is empty.
+		 */
+		if (bfqq->pos_root != NULL) {
+			rb_erase(&bfqq->pos_node, bfqq->pos_root);
+			bfqq->pos_root = NULL;
+		}
+	}
+
+	if (rq->cmd_flags & REQ_META) {
+		BUG_ON(bfqq->meta_pending == 0);
+		bfqq->meta_pending--;
+	}
+}
+
+static int bfq_merge(struct request_queue *q, struct request **req,
+		     struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct request *__rq;
+
+	__rq = bfq_find_rq_fmerge(bfqd, bio);
+	if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+static void bfq_merged_request(struct request_queue *q, struct request *req,
+			       int type)
+{
+	if (type == ELEVATOR_FRONT_MERGE &&
+	    rb_prev(&req->rb_node) &&
+	    blk_rq_pos(req) <
+	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
+				    struct request, rb_node))) {
+		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_data *bfqd = bfqq->bfqd;
+		struct request *prev, *next_rq;
+
+		/* Reposition request in its sort_list */
+		elv_rb_del(&bfqq->sort_list, req);
+		elv_rb_add(&bfqq->sort_list, req);
+		/* Choose next request to be served for bfqq */
+		prev = bfqq->next_rq;
+		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+					 bfqd->last_position);
+		BUG_ON(next_rq == NULL);
+		bfqq->next_rq = next_rq;
+		/*
+		 * If next_rq changes, update both the queue's budget to
+		 * fit the new request and the queue's position in its
+		 * rq_pos_tree.
+		 */
+		if (prev != bfqq->next_rq) {
+			bfq_updated_next_req(bfqd, bfqq);
+			bfq_rq_pos_tree_add(bfqd, bfqq);
+		}
+	}
+}
+
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+				struct request *next)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+	/*
+	 * If next and rq belong to the same bfq_queue and next is older
+	 * than rq, then reposition rq in the fifo (by substituting next
+	 * with rq). Otherwise, if next and rq belong to different
+	 * bfq_queues, never reposition rq: in fact, we would have to
+	 * reposition it with respect to next's position in its own fifo,
+	 * which would most certainly be too expensive with respect to
+	 * the benefits.
+	 */
+	if (bfqq == next_bfqq &&
+	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
+		list_del_init(&rq->queuelist);
+		list_replace_init(&next->queuelist, &rq->queuelist);
+		rq_set_fifo_time(rq, rq_fifo_time(next));
+	}
+
+	if (bfqq->next_rq == next)
+		bfqq->next_rq = rq;
+
+	bfq_remove_request(next);
+}
+
+/* Must be called with bfqq != NULL */
+static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq == NULL);
+	if (bfq_bfqq_busy(bfqq))
+		bfqq->bfqd->wr_busy_queues--;
+	bfqq->wr_coeff = 1;
+	bfqq->wr_cur_max_time = 0;
+	/* Trigger a weight change on the next activation of the queue */
+	bfqq->entity.ioprio_changed = 1;
+}
+
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j] != NULL)
+				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq != NULL)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+}
+
+static inline sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+	if (request)
+		return blk_rq_pos(io_struct);
+	else
+		return ((struct bio *)io_struct)->bi_sector;
+}
+
+static inline sector_t bfq_dist_from(sector_t pos1,
+				     sector_t pos2)
+{
+	if (pos1 >= pos2)
+		return pos1 - pos2;
+	else
+		return pos2 - pos1;
+}
+
+static inline int bfq_rq_close_to_sector(void *io_struct, bool request,
+					 sector_t sector)
+{
+	return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <=
+	       BFQQ_SEEK_THR;
+}
+
+static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector)
+{
+	struct rb_root *root = &bfqd->rq_pos_tree;
+	struct rb_node *parent, *node;
+	struct bfq_queue *__bfqq;
+
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
+
+	/*
+	 * First, if we find a request starting at the end of the last
+	 * request, choose it.
+	 */
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+	if (__bfqq != NULL)
+		return __bfqq;
+
+	/*
+	 * If the exact sector wasn't found, the parent of the NULL leaf
+	 * will contain the closest sector (rq_pos_tree sorted by
+	 * next_request position).
+	 */
+	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	if (blk_rq_pos(__bfqq->next_rq) < sector)
+		node = rb_next(&__bfqq->pos_node);
+	else
+		node = rb_prev(&__bfqq->pos_node);
+	if (node == NULL)
+		return NULL;
+
+	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	return NULL;
+}
+
+/*
+ * bfqd - obvious
+ * cur_bfqq - passed in so that we don't decide that the current queue
+ *            is closely cooperating with itself
+ * sector - used as a reference point to search for a close queue
+ */
+static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd,
+					      struct bfq_queue *cur_bfqq,
+					      sector_t sector)
+{
+	struct bfq_queue *bfqq;
+
+	if (bfq_class_idle(cur_bfqq))
+		return NULL;
+	if (!bfq_bfqq_sync(cur_bfqq))
+		return NULL;
+	if (BFQQ_SEEKY(cur_bfqq))
+		return NULL;
+
+	/* If device has only one backlogged bfq_queue, don't search. */
+	if (bfqd->busy_queues == 1)
+		return NULL;
+
+	/*
+	 * We should notice if some of the queues are cooperating, e.g.
+	 * working closely on the same area of the disk. In that case,
+	 * we can group them together and don't waste time idling.
+	 */
+	bfqq = bfqq_close(bfqd, sector);
+	if (bfqq == NULL || bfqq == cur_bfqq)
+		return NULL;
+
+	/*
+	 * Do not merge queues from different bfq_groups.
+	*/
+	if (bfqq->entity.parent != cur_bfqq->entity.parent)
+		return NULL;
+
+	/*
+	 * It only makes sense to merge sync queues.
+	 */
+	if (!bfq_bfqq_sync(bfqq))
+		return NULL;
+	if (BFQQ_SEEKY(bfqq))
+		return NULL;
+
+	/*
+	 * Do not merge queues of different priority classes.
+	 */
+	if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq))
+		return NULL;
+
+	return bfqq;
+}
+
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	int process_refs, new_process_refs;
+	struct bfq_queue *__bfqq;
+
+	/*
+	 * If there are no process references on the new_bfqq, then it is
+	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+	 * may have dropped their last reference (not just their last process
+	 * reference).
+	 */
+	if (!bfqq_process_refs(new_bfqq))
+		return NULL;
+
+	/* Avoid a circular list and skip interim queue merges. */
+	while ((__bfqq = new_bfqq->new_bfqq)) {
+		if (__bfqq == bfqq)
+			return NULL;
+		new_bfqq = __bfqq;
+	}
+
+	process_refs = bfqq_process_refs(bfqq);
+	new_process_refs = bfqq_process_refs(new_bfqq);
+	/*
+	 * If the process for the bfqq has gone away, there is no
+	 * sense in merging the queues.
+	 */
+	if (process_refs == 0 || new_process_refs == 0)
+		return NULL;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+		new_bfqq->pid);
+
+	/*
+	 * Merging is just a redirection: the requests of the process
+	 * owning one of the two queues are redirected to the other queue.
+	 * The latter queue, in its turn, is set as shared if this is the
+	 * first time that the requests of some process are redirected to
+	 * it.
+	 *
+	 * We redirect bfqq to new_bfqq and not the opposite, because we
+	 * are in the context of the process owning bfqq, hence we have
+	 * the io_cq of this process. So we can immediately configure this
+	 * io_cq to redirect the requests of the process to new_bfqq.
+	 *
+	 * NOTE, even if new_bfqq coincides with the in-service queue, the
+	 * io_cq of new_bfqq is not available, because, if the in-service
+	 * queue is shared, bfqd->in_service_bic may not point to the
+	 * io_cq of the in-service queue.
+	 * Redirecting the requests of the process owning bfqq to the
+	 * currently in-service queue is in any case the best option, as
+	 * we feed the in-service queue with new requests close to the
+	 * last request served and, by doing so, hopefully increase the
+	 * throughput.
+	 */
+	bfqq->new_bfqq = new_bfqq;
+	atomic_add(process_refs, &new_bfqq->ref);
+	return new_bfqq;
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service queue
+ * or with a close queue among the scheduled queues.
+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * The OOM queue is not allowed to participate to cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be little relevant, or even negligible.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		     void *io_struct, bool request)
+{
+	struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+	if (bfqq->new_bfqq)
+		return bfqq->new_bfqq;
+
+	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
+		return NULL;
+
+	in_service_bfqq = bfqd->in_service_queue;
+
+	if (in_service_bfqq == NULL || in_service_bfqq == bfqq ||
+	    !bfqd->in_service_bic ||
+	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+		goto check_scheduled;
+
+	if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq))
+		goto check_scheduled;
+
+	if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq))
+		goto check_scheduled;
+
+	if (in_service_bfqq->entity.parent != bfqq->entity.parent)
+		goto check_scheduled;
+
+	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+	    bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) {
+		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+		if (new_bfqq != NULL)
+			return new_bfqq; /* Merge with in-service queue */
+	}
+
+	/*
+	 * Check whether there is a cooperator among currently scheduled
+	 * queues. The only thing we need is that the bio/request is not
+	 * NULL, as we need it to establish whether a cooperator exists.
+	 */
+check_scheduled:
+	new_bfqq = bfq_close_cooperator(bfqd, bfqq,
+					bfq_io_struct_pos(io_struct, request));
+	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq))
+		return bfq_setup_merge(bfqq, new_bfqq);
+
+	return NULL;
+}
+
+static inline void
+bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq->bic == NULL, the queue is already shared or its requests
+	 * have already been redirected to a shared queue; both idle window
+	 * and weight raising state have already been saved. Do nothing.
+	 */
+	if (bfqq->bic == NULL)
+		return;
+	if (bfqq->bic->wr_time_left)
+		/*
+		 * This is the queue of a just-started process, and would
+		 * deserve weight raising: we set wr_time_left to the full
+		 * weight-raising duration to trigger weight-raising when
+		 * and if the queue is split and the first request of the
+		 * queue is enqueued.
+		 */
+		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
+	else if (bfqq->wr_coeff > 1) {
+		unsigned long wr_duration =
+			jiffies - bfqq->last_wr_start_finish;
+		/*
+		 * It may happen that a queue's weight raising period lasts
+		 * longer than its wr_cur_max_time, as weight raising is
+		 * handled only when a request is enqueued or dispatched (it
+		 * does not use any timer). If the weight raising period is
+		 * about to end, don't save it.
+		 */
+		if (bfqq->wr_cur_max_time <= wr_duration)
+			bfqq->bic->wr_time_left = 0;
+		else
+			bfqq->bic->wr_time_left =
+				bfqq->wr_cur_max_time - wr_duration;
+		/*
+		 * The bfq_queue is becoming shared or the requests of the
+		 * process owning the queue are being redirected to a shared
+		 * queue. Stop the weight raising period of the queue, as in
+		 * both cases it should not be owned by an interactive or
+		 * soft real-time application.
+		 */
+		bfq_bfqq_end_wr(bfqq);
+	} else
+		bfqq->bic->wr_time_left = 0;
+	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+	bfqq->bic->cooperations++;
+	bfqq->bic->failed_cooperations = 0;
+}
+
+static inline void
+bfq_get_bic_reference(struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
+	 * is about to begin using a shared bfq_queue.
+	 */
+	if (bfqq->bic)
+		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
+}
+
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+		(long unsigned)new_bfqq->pid);
+	/* Save weight raising and idle window of the merged queues */
+	bfq_bfqq_save_state(bfqq);
+	bfq_bfqq_save_state(new_bfqq);
+	if (bfq_bfqq_IO_bound(bfqq))
+		bfq_mark_bfqq_IO_bound(new_bfqq);
+	bfq_clear_bfqq_IO_bound(bfqq);
+	/*
+	 * Grab a reference to the bic, to prevent it from being destroyed
+	 * before being possibly touched by a bfq_split_bfqq().
+	 */
+	bfq_get_bic_reference(bfqq);
+	bfq_get_bic_reference(new_bfqq);
+	/*
+	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
+	 */
+	bic_set_bfqq(bic, new_bfqq, 1);
+	bfq_mark_bfqq_coop(new_bfqq);
+	/*
+	 * new_bfqq now belongs to at least two bics (it is a shared queue):
+	 * set new_bfqq->bic to NULL. bfqq either:
+	 * - does not belong to any bic any more, and hence bfqq->bic must
+	 *   be set to NULL, or
+	 * - is a queue whose owning bics have already been redirected to a
+	 *   different queue, hence the queue is destined to not belong to
+	 *   any bic soon and bfqq->bic is already NULL (therefore the next
+	 *   assignment causes no harm).
+	 */
+	new_bfqq->bic = NULL;
+	bfqq->bic = NULL;
+	bfq_put_queue(bfqq);
+}
+
+static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
+{
+	struct bfq_io_cq *bic = bfqq->bic;
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
+		bic->failed_cooperations++;
+		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
+			bic->cooperations = 0;
+	}
+}
+
+static int bfq_allow_merge(struct request_queue *q, struct request *rq,
+			   struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq, *new_bfqq;
+
+	/*
+	 * Disallow merge of a sync bio into an async request.
+	 */
+	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
+		return 0;
+
+	/*
+	 * Lookup the bfqq that this bio will be queued with. Allow
+	 * merge only if rq is queued there.
+	 * Queue lock is held here.
+	 */
+	bic = bfq_bic_lookup(bfqd, current->io_context);
+	if (bic == NULL)
+		return 0;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	/*
+	 * We take advantage of this function to perform an early merge
+	 * of the queues of possible cooperating processes.
+	 */
+	if (bfqq != NULL) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+		if (new_bfqq != NULL) {
+			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+			/*
+			 * If we get here, the bio will be queued in the
+			 * shared queue, i.e., new_bfqq, so use new_bfqq
+			 * to decide whether bio and rq can be merged.
+			 */
+			bfqq = new_bfqq;
+		} else
+			bfq_bfqq_increase_failed_cooperations(bfqq);
+	}
+
+	return bfqq == RQ_BFQQ(rq);
+}
+
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+				       struct bfq_queue *bfqq)
+{
+	if (bfqq != NULL) {
+		bfq_mark_bfqq_must_alloc(bfqq);
+		bfq_mark_bfqq_budget_new(bfqq);
+		bfq_clear_bfqq_fifo_expire(bfqq);
+
+		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+
+		bfq_log_bfqq(bfqd, bfqq,
+			     "set_in_service_queue, cur-budget = %lu",
+			     bfqq->entity.budget);
+	}
+
+	bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Get and set a new queue for service.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
+
+	__bfq_set_in_service_queue(bfqd, bfqq);
+	return bfqq;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static inline unsigned long bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < 194)
+		return bfq_default_max_budget;
+	else
+		return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static inline unsigned long bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < 194)
+		return bfq_default_max_budget / 32;
+	else
+		return bfqd->bfq_max_budget / 32;
+}
+
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	struct bfq_io_cq *bic;
+	unsigned long sl;
+
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Processes have exited, don't wait. */
+	bic = bfqd->in_service_bic;
+	if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0)
+		return;
+
+	bfq_mark_bfqq_wait_request(bfqq);
+
+	/*
+	 * We don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. So allow a little bit of time for him to submit a new rq.
+	 *
+	 * To prevent processes with (partly) seeky workloads from
+	 * being too ill-treated, grant them a small fraction of the
+	 * assigned budget before reducing the waiting time to
+	 * BFQ_MIN_TT. This happened to help reduce latency.
+	 */
+	sl = bfqd->bfq_slice_idle;
+	/*
+	 * Unless the queue is being weight-raised or the scenario is
+	 * asymmetric, grant only minimum idle time if the queue either
+	 * has been seeky for long enough or has already proved to be
+	 * constantly seeky.
+	 */
+	if (bfq_sample_valid(bfqq->seek_samples) &&
+	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
+				  bfq_max_budget(bfqq->bfqd) / 8) ||
+	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
+	    symmetric_scenario)
+		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
+	else if (bfqq->wr_coeff > 1)
+		sl = sl * 3;
+	bfqd->last_idling_start = ktime_get();
+	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
+	bfq_log(bfqd, "arm idle: %u/%u ms",
+		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the disk
+ * throughput (always guaranteed with a time slice scheme as in CFQ).
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	unsigned int timeout_coeff;
+	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = 1;
+	else
+		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+	bfqd->last_budget_start = ktime_get();
+
+	bfq_clear_bfqq_budget_new(bfqq);
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+
+	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
+		timeout_coeff));
+}
+
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	/*
+	 * For consistency, the next instruction should have been executed
+	 * after removing the request from the queue and dispatching it.
+	 * We execute instead this instruction before bfq_remove_request()
+	 * (and hence introduce a temporary inconsistency), for efficiency.
+	 * In fact, in a forced_dispatch, this prevents two counters related
+	 * to bfqq->dispatched to risk to be uselessly decremented if bfqq
+	 * is not in service, and then to be incremented again after
+	 * incrementing bfqq->dispatched.
+	 */
+	bfqq->dispatched++;
+	bfq_remove_request(rq);
+	elv_dispatch_sort(q, rq);
+
+	if (bfq_bfqq_sync(bfqq))
+		bfqd->sync_flight++;
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+{
+	struct request *rq = NULL;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	if (list_empty(&bfqq->fifo))
+		return NULL;
+
+	rq = rq_entry_fifo(bfqq->fifo.next);
+
+	if (time_before(jiffies, rq_fifo_time(rq)))
+		return NULL;
+
+	return rq;
+}
+
+static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	return entity->budget - entity->service;
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	__bfq_bfqd_reset_in_service(bfqd);
+
+	/*
+	 * If this bfqq is shared between multiple processes, check
+	 * to make sure that those processes are still issuing I/Os
+	 * within the mean seek distance. If not, it may be time to
+	 * break the queues apart again.
+	 */
+	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+		bfq_mark_bfqq_split_coop(bfqq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * Overloading budget_timeout field to store the time
+		 * at which the queue remains with no backlog; used by
+		 * the weight-raising mechanism.
+		 */
+		bfqq->budget_timeout = jiffies;
+		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+	} else {
+		bfq_activate_bfqq(bfqd, bfqq);
+		/*
+		 * Resort priority tree of potential close cooperators.
+		 */
+		bfq_rq_pos_tree_add(bfqd, bfqq);
+	}
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget.  See the body for detailed
+ * comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+				     struct bfq_queue *bfqq,
+				     enum bfqq_expiration reason)
+{
+	struct request *next_rq;
+	unsigned long budget, min_budget;
+
+	budget = bfqq->max_budget;
+	min_budget = bfq_min_budget(bfqd);
+
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu",
+		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu",
+		budget, bfq_min_budget(bfqd));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+	if (bfq_bfqq_sync(bfqq)) {
+		switch (reason) {
+		/*
+		 * Caveat: in all the following cases we trade latency
+		 * for throughput.
+		 */
+		case BFQ_BFQQ_TOO_IDLE:
+			/*
+			 * This is the only case where we may reduce
+			 * the budget: if there is no request of the
+			 * process still waiting for completion, then
+			 * we assume (tentatively) that the timer has
+			 * expired because the batch of requests of
+			 * the process could have been served with a
+			 * smaller budget.  Hence, betting that
+			 * process will behave in the same way when it
+			 * becomes backlogged again, we reduce its
+			 * next budget.  As long as we guess right,
+			 * this budget cut reduces the latency
+			 * experienced by the process.
+			 *
+			 * However, if there are still outstanding
+			 * requests, then the process may have not yet
+			 * issued its next request just because it is
+			 * still waiting for the completion of some of
+			 * the still outstanding ones.  So in this
+			 * subcase we do not reduce its budget, on the
+			 * contrary we increase it to possibly boost
+			 * the throughput, as discussed in the
+			 * comments to the BUDGET_TIMEOUT case.
+			 */
+			if (bfqq->dispatched > 0) /* still outstanding reqs */
+				budget = min(budget * 2, bfqd->bfq_max_budget);
+			else {
+				if (budget > 5 * min_budget)
+					budget -= 4 * min_budget;
+				else
+					budget = min_budget;
+			}
+			break;
+		case BFQ_BFQQ_BUDGET_TIMEOUT:
+			/*
+			 * We double the budget here because: 1) it
+			 * gives the chance to boost the throughput if
+			 * this is not a seeky process (which may have
+			 * bumped into this timeout because of, e.g.,
+			 * ZBR), 2) together with charge_full_budget
+			 * it helps give seeky processes higher
+			 * timestamps, and hence be served less
+			 * frequently.
+			 */
+			budget = min(budget * 2, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_BUDGET_EXHAUSTED:
+			/*
+			 * The process still has backlog, and did not
+			 * let either the budget timeout or the disk
+			 * idling timeout expire. Hence it is not
+			 * seeky, has a short thinktime and may be
+			 * happy with a higher budget too. So
+			 * definitely increase the budget of this good
+			 * candidate to boost the disk throughput.
+			 */
+			budget = min(budget * 4, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_NO_MORE_REQUESTS:
+		       /*
+			* Leave the budget unchanged.
+			*/
+		default:
+			return;
+		}
+	} else /* async queue */
+	    /* async queues get always the maximum possible budget
+	     * (their ability to dispatch is limited by
+	     * @bfqd->bfq_max_budget_async_rq).
+	     */
+		budget = bfqd->bfq_max_budget;
+
+	bfqq->max_budget = budget;
+
+	if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 &&
+	    bfqq->max_budget > bfqd->bfq_max_budget)
+		bfqq->max_budget = bfqd->bfq_max_budget;
+
+	/*
+	 * Make sure that we have enough budget for the next request.
+	 * Since the finish time of the bfqq must be kept in sync with
+	 * the budget, be sure to call __bfq_bfqq_expire() after the
+	 * update.
+	 */
+	next_rq = bfqq->next_rq;
+	if (next_rq != NULL)
+		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+					    bfq_serv_to_charge(next_rq, bfqq));
+	else
+		bfqq->entity.budget = bfqq->max_budget;
+
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu",
+			next_rq != NULL ? blk_rq_sectors(next_rq) : 0,
+			bfqq->entity.budget);
+}
+
+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
+{
+	unsigned long max_budget;
+
+	/*
+	 * The max_budget calculated when autotuning is equal to the
+	 * amount of sectors transfered in timeout_sync at the
+	 * estimated peak rate.
+	 */
+	max_budget = (unsigned long)(peak_rate * 1000 *
+				     timeout >> BFQ_RATE_SHIFT);
+
+	return max_budget;
+}
+
+/*
+ * In addition to updating the peak rate, checks whether the process
+ * is "slow", and returns 1 if so. This slow flag is used, in addition
+ * to the budget timeout, to reduce the amount of service provided to
+ * seeky processes, and hence reduce their chances to lower the
+ * throughput. See the code for more details.
+ */
+static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				int compensate, enum bfqq_expiration reason)
+{
+	u64 bw, usecs, expected, timeout;
+	ktime_t delta;
+	int update = 0;
+
+	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+		return 0;
+
+	if (compensate)
+		delta = bfqd->last_idling_start;
+	else
+		delta = ktime_get();
+	delta = ktime_sub(delta, bfqd->last_budget_start);
+	usecs = ktime_to_us(delta);
+
+	/* Don't trust short/unrealistic values. */
+	if (usecs < 100 || usecs >= LONG_MAX)
+		return 0;
+
+	/*
+	 * Calculate the bandwidth for the last slice.  We use a 64 bit
+	 * value to store the peak rate, in sectors per usec in fixed
+	 * point math.  We do so to have enough precision in the estimate
+	 * and to avoid overflows.
+	 */
+	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
+	do_div(bw, (unsigned long)usecs);
+
+	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+	/*
+	 * Use only long (> 20ms) intervals to filter out spikes for
+	 * the peak rate estimation.
+	 */
+	if (usecs > 20000) {
+		if (bw > bfqd->peak_rate ||
+		   (!BFQQ_SEEKY(bfqq) &&
+		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
+			bfq_log(bfqd, "measured bw =%llu", bw);
+			/*
+			 * To smooth oscillations use a low-pass filter with
+			 * alpha=7/8, i.e.,
+			 * new_rate = (7/8) * old_rate + (1/8) * bw
+			 */
+			do_div(bw, 8);
+			if (bw == 0)
+				return 0;
+			bfqd->peak_rate *= 7;
+			do_div(bfqd->peak_rate, 8);
+			bfqd->peak_rate += bw;
+			update = 1;
+			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
+		}
+
+		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
+
+		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
+			bfqd->peak_rate_samples++;
+
+		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
+		    update) {
+			int dev_type = blk_queue_nonrot(bfqd->queue);
+			if (bfqd->bfq_user_max_budget == 0) {
+				bfqd->bfq_max_budget =
+					bfq_calc_max_budget(bfqd->peak_rate,
+							    timeout);
+				bfq_log(bfqd, "new max_budget=%lu",
+					bfqd->bfq_max_budget);
+			}
+			if (bfqd->device_speed == BFQ_BFQD_FAST &&
+			    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_SLOW;
+				bfqd->RT_prod = R_slow[dev_type] *
+						T_slow[dev_type];
+			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+			    bfqd->peak_rate > device_speed_thresh[dev_type]) {
+				bfqd->device_speed = BFQ_BFQD_FAST;
+				bfqd->RT_prod = R_fast[dev_type] *
+						T_fast[dev_type];
+			}
+		}
+	}
+
+	/*
+	 * If the process has been served for a too short time
+	 * interval to let its possible sequential accesses prevail on
+	 * the initial seek time needed to move the disk head on the
+	 * first sector it requested, then give the process a chance
+	 * and for the moment return false.
+	 */
+	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
+		return 0;
+
+	/*
+	 * A process is considered ``slow'' (i.e., seeky, so that we
+	 * cannot treat it fairly in the service domain, as it would
+	 * slow down too much the other processes) if, when a slice
+	 * ends for whatever reason, it has received service at a
+	 * rate that would not be high enough to complete the budget
+	 * before the budget timeout expiration.
+	 */
+	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+
+	/*
+	 * Caveat: processes doing IO in the slower disk zones will
+	 * tend to be slow(er) even if not seeky. And the estimated
+	 * peak rate will actually be an average over the disk
+	 * surface. Hence, to not be too harsh with unlucky processes,
+	 * we keep a budget/3 margin of safety before declaring a
+	 * process slow.
+	 */
+	return expected > (4 * bfqq->entity.budget) / 3;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application to be
+ * deemed as soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ *    HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ *    for a while, then suddenly 'jump' by several units to recover the lost
+ *    increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+						       struct bfq_queue *bfqq)
+{
+	return max(bfqq->last_idle_bklogged +
+		   HZ * bfqq->service_from_backlogged /
+		   bfqd->bfq_wr_max_softrt_rate,
+		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);
+}
+
+/*
+ * Return the largest-possible time instant such that, for as long as possible,
+ * the current time will be lower than this time instant according to the macro
+ * time_is_before_jiffies().
+ */
+static inline unsigned long bfq_infinity_from_now(unsigned long now)
+{
+	return now + ULONG_MAX / 2;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ *
+ * If the process associated to the queue is slow (i.e., seeky), or in
+ * case of budget timeout, or, finally, if it is async, we
+ * artificially charge it an entire budget (independently of the
+ * actual service it received). As a consequence, the queue will get
+ * higher timestamps than the correct ones upon reactivation, and
+ * hence it will be rescheduled as if it had received more service
+ * than what it actually received. In the end, this class of processes
+ * will receive less service in proportion to how slowly they consume
+ * their budgets (and hence how seriously they tend to lower the
+ * throughput).
+ *
+ * In contrast, when a queue expires because it has been idling for
+ * too much or because it exhausted its budget, we do not touch the
+ * amount of service it has received. Hence when the queue will be
+ * reactivated and its timestamps updated, the latter will be in sync
+ * with the actual service received by the queue until expiration.
+ *
+ * Charging a full budget to the first type of queues and the exact
+ * service to the others has the effect of using the WF2Q+ policy to
+ * schedule the former on a timeslice basis, without violating the
+ * service domain guarantees of the latter.
+ */
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    int compensate,
+			    enum bfqq_expiration reason)
+{
+	int slow;
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	/* Update disk peak rate for autotuning and check whether the
+	 * process is slow (see bfq_update_peak_rate).
+	 */
+	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
+
+	/*
+	 * As above explained, 'punish' slow (i.e., seeky), timed-out
+	 * and async queues, to favor sequential sync workloads.
+	 *
+	 * Processes doing I/O in the slower disk zones will tend to be
+	 * slow(er) even if not seeky. Hence, since the estimated peak
+	 * rate is actually an average over the disk surface, these
+	 * processes may timeout just for bad luck. To avoid punishing
+	 * them we do not charge a full budget to a process that
+	 * succeeded in consuming at least 2/3 of its budget.
+	 */
+	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))
+		bfq_bfqq_charge_full_budget(bfqq);
+
+	bfqq->service_from_backlogged += bfqq->entity.service;
+
+	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+	    !bfq_bfqq_constantly_seeky(bfqq)) {
+		bfq_mark_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue))
+			bfqd->const_seeky_busy_in_flight_queues++;
+	}
+
+	if (reason == BFQ_BFQQ_TOO_IDLE &&
+	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10 )
+		bfq_clear_bfqq_IO_bound(bfqq);
+
+	if (bfqd->low_latency && bfqq->wr_coeff == 1)
+		bfqq->last_wr_start_finish = jiffies;
+
+	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * If we get here, and there are no outstanding requests,
+		 * then the request pattern is isochronous (see the comments
+		 * to the function bfq_bfqq_softrt_next_start()). Hence we
+		 * can compute soft_rt_next_start. If, instead, the queue
+		 * still has outstanding requests, then we have to wait
+		 * for the completion of all the outstanding requests to
+		 * discover whether the request pattern is actually
+		 * isochronous.
+		 */
+		if (bfqq->dispatched == 0)
+			bfqq->soft_rt_next_start =
+				bfq_bfqq_softrt_next_start(bfqd, bfqq);
+		else {
+			/*
+			 * The application is still waiting for the
+			 * completion of one or more requests:
+			 * prevent it from possibly being incorrectly
+			 * deemed as soft real-time by setting its
+			 * soft_rt_next_start to infinity. In fact,
+			 * without this assignment, the application
+			 * would be incorrectly deemed as soft
+			 * real-time if:
+			 * 1) it issued a new request before the
+			 *    completion of all its in-flight
+			 *    requests, and
+			 * 2) at that time, its soft_rt_next_start
+			 *    happened to be in the past.
+			 */
+			bfqq->soft_rt_next_start =
+				bfq_infinity_from_now(jiffies);
+			/*
+			 * Schedule an update of soft_rt_next_start to when
+			 * the task may be discovered to be isochronous.
+			 */
+			bfq_mark_bfqq_softrt_update(bfqq);
+		}
+	}
+
+	bfq_log_bfqq(bfqd, bfqq,
+		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
+		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+
+	/*
+	 * Increase, decrease or leave budget unchanged according to
+	 * reason.
+	 */
+	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+	__bfq_bfqq_expire(bfqd, bfqq);
+}
+
+/*
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
+ */
+static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_budget_new(bfqq) ||
+	    time_before(jiffies, bfqq->budget_timeout))
+		return 0;
+	return 1;
+}
+
+/*
+ * If we expire a queue that is waiting for the arrival of a new
+ * request, we may prevent the fictitious timestamp back-shifting that
+ * allows the guarantees of the queue to be preserved (see [1] for
+ * this tricky aspect). Hence we return true only if this condition
+ * does not hold, or if the queue is slow enough to deserve only to be
+ * kicked off for preserving a high throughput.
+*/
+static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		"may_budget_timeout: wait_request %d left %d timeout %d",
+		bfq_bfqq_wait_request(bfqq),
+			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,
+		bfq_bfqq_budget_timeout(bfqq));
+
+	return (!bfq_bfqq_wait_request(bfqq) ||
+		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
+		&&
+		bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * Device idling is allowed only for the queues for which this function
+ * returns true. For this reason, the return value of this function plays a
+ * critical role for both throughput boosting and service guarantees. The
+ * return value is computed through a logical expression. In this rather
+ * long comment, we try to briefly describe all the details and motivations
+ * behind the components of this logical expression.
+ *
+ * First, the expression is false if bfqq is not sync, or if: bfqq happened
+ * to become active during a large burst of queue activations, and the
+ * pattern of requests bfqq contains boosts the throughput if bfqq is
+ * expired. In fact, queues that became active during a large burst benefit
+ * only from throughput, as discussed in the comments to bfq_handle_burst.
+ * In this respect, expiring bfqq certainly boosts the throughput on NCQ-
+ * capable flash-based devices, whereas, on rotational devices, it boosts
+ * the throughput only if bfqq contains random requests.
+ *
+ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related
+ * condition does not hold, and (c) bfqq is being weight-raised, then the
+ * expression always evaluates to true, as device idling is instrumental
+ * for preserving low-latency guarantees (see [1]). If, instead, conditions
+ * (a) and (b) do hold, but (c) does not, then the expression evaluates to
+ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and
+ * (2) at least one of the following two conditions holds.
+ * The first condition is that the device is not performing NCQ, because
+ * idling the device most certainly boosts the throughput if this condition
+ * holds and bfqq is I/O-bound and has been granted a non-null idle window.
+ * The second compound condition is made of the logical AND of two components.
+ *
+ * The first component is true only if there is no weight-raised busy
+ * queue. This guarantees that the device is not idled for a sync non-
+ * weight-raised queue when there are busy weight-raised queues. The former
+ * is then expired immediately if empty. Combined with the timestamping
+ * rules of BFQ (see [1] for details), this causes sync non-weight-raised
+ * queues to get a lower number of requests served, and hence to ask for a
+ * lower number of requests from the request pool, before the busy weight-
+ * raised queues get served again.
+ *
+ * This is beneficial for the processes associated with weight-raised
+ * queues, when the request pool is saturated (e.g., in the presence of
+ * write hogs). In fact, if the processes associated with the other queues
+ * ask for requests at a lower rate, then weight-raised processes have a
+ * higher probability to get a request from the pool immediately (or at
+ * least soon) when they need one. Hence they have a higher probability to
+ * actually get a fraction of the disk throughput proportional to their
+ * high weight. This is especially true with NCQ-capable drives, which
+ * enqueue several requests in advance and further reorder internally-
+ * queued requests.
+ *
+ * In the end, mistreating non-weight-raised queues when there are busy
+ * weight-raised queues seems to mitigate starvation problems in the
+ * presence of heavy write workloads and NCQ, and hence to guarantee a
+ * higher application and system responsiveness in these hostile scenarios.
+ *
+ * If the first component of the compound condition is instead true, i.e.,
+ * there is no weight-raised busy queue, then the second component of the
+ * compound condition takes into account service-guarantee and throughput
+ * issues related to NCQ (recall that the compound condition is evaluated
+ * only if the device is detected as supporting NCQ).
+ *
+ * As for service guarantees, allowing the drive to enqueue more than one
+ * request at a time, and hence delegating de facto final scheduling
+ * decisions to the drive's internal scheduler, causes loss of control on
+ * the actual request service order. In this respect, when the drive is
+ * allowed to enqueue more than one request at a time, the service
+ * distribution enforced by the drive's internal scheduler is likely to
+ * coincide with the desired device-throughput distribution only in the
+ * following, perfectly symmetric, scenario:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Even in such a scenario, sequential I/O may still receive a preferential
+ * treatment, but this is not likely to be a big issue with flash-based
+ * devices, because of their non-dramatic loss of throughput with random
+ * I/O. Things do differ with HDDs, for which additional care is taken, as
+ * explained after completing the discussion for flash-based devices.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore BFQ evaluates instead the following stronger sub-conditions,
+ * for which it is much easier to maintain the needed state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, hence no state needs
+ * to be maintained in this case.
+ *
+ * According to the above considerations, the second component of the
+ * compound condition evaluates to true if any of the above symmetry
+ * sub-condition does not hold, or the device is not flash-based. Therefore,
+ * if also the first component is true, then idling is allowed for a sync
+ * queue. These are the only sub-conditions considered if the device is
+ * flash-based, as, for such a device, it is sensible to force idling only
+ * for service-guarantee issues. In fact, as for throughput, idling
+ * NCQ-capable flash-based devices would not boost the throughput even
+ * with sequential I/O; rather it would lower the throughput in proportion
+ * to how fast the device is. In the end, (only) if all the three
+ * sub-conditions hold and the device is flash-based, the compound
+ * condition evaluates to false and therefore no idling is performed.
+ *
+ * As already said, things change with a rotational device, where idling
+ * boosts the throughput with sequential I/O (even with NCQ). Hence, for
+ * such a device the second component of the compound condition evaluates
+ * to true also if the following additional sub-condition does not hold:
+ * the queue is constantly seeky. Unfortunately, this different behavior
+ * with respect to flash-based devices causes an additional asymmetry: if
+ * some sync queues enjoy idling and some other sync queues do not, then
+ * the latter get a low share of the device throughput, simply because the
+ * former get many requests served after being set as in service, whereas
+ * the latter do not. As a consequence, to guarantee the desired throughput
+ * distribution, on HDDs the compound expression evaluates to true (and
+ * hence device idling is performed) also if the following last symmetry
+ * condition does not hold: no other queue is benefiting from idling. Also
+ * this last condition is actually replaced with a simpler-to-maintain and
+ * stronger condition: there is no busy queue which is not constantly seeky
+ * (and hence may also benefit from idling).
+ *
+ * To sum up, when all the required symmetry and throughput-boosting
+ * sub-conditions hold, the second component of the compound condition
+ * evaluates to false, and hence no idling is performed. This helps to
+ * keep the drives' internal queues full on NCQ-capable devices, and hence
+ * to boost the throughput, without causing 'almost' any loss of service
+ * guarantees. The 'almost' follows from the fact that, if the internal
+ * queue of one such device is filled while all the sub-conditions hold,
+ * but at some point in time some sub-condition stops to hold, then it may
+ * become impossible to let requests be served in the new desired order
+ * until all the requests already queued in the device have been served.
+ */
+static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \
+				   bfqd->busy_in_flight_queues == \
+				   bfqd->const_seeky_busy_in_flight_queues)
+
+#define cond_for_expiring_in_burst	(bfq_bfqq_in_large_burst(bfqq) && \
+					 bfqd->hw_tag && \
+					 (blk_queue_nonrot(bfqd->queue) || \
+					  bfq_bfqq_constantly_seeky(bfqq)))
+
+/*
+ * Condition for expiring a non-weight-raised queue (and hence not idling
+ * the device).
+ */
+#define cond_for_expiring_non_wr  (bfqd->hw_tag && \
+				   (bfqd->wr_busy_queues > 0 || \
+				    (blk_queue_nonrot(bfqd->queue) || \
+				      cond_for_seeky_on_ncq_hdd)))
+
+	return bfq_bfqq_sync(bfqq) &&
+		!cond_for_expiring_in_burst &&
+		(bfqq->wr_coeff > 1 || !symmetric_scenario ||
+		 (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) &&
+		  !cond_for_expiring_non_wr)
+	);
+}
+
+/*
+ * If the in-service queue is empty but sync, and the function
+ * bfq_bfqq_must_not_expire returns true, then:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the disk must be idled to wait for the possible arrival of a new
+ *    request for the queue.
+ * See the comments to the function bfq_bfqq_must_not_expire for the reasons
+ * why performing device idling is the best choice to boost the throughput
+ * and preserve service guarantees when bfq_bfqq_must_not_expire itself
+ * returns true.
+ */
+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+	       bfq_bfqq_must_not_expire(bfqq);
+}
+
+/*
+ * Select a queue for service.  If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+	struct request *next_rq;
+	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+
+	bfqq = bfqd->in_service_queue;
+	if (bfqq == NULL)
+		goto new_queue;
+
+	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+	if (bfq_may_expire_for_budg_timeout(bfqq) &&
+	    !timer_pending(&bfqd->idle_slice_timer) &&
+	    !bfq_bfqq_must_idle(bfqq))
+		goto expire;
+
+	next_rq = bfqq->next_rq;
+	/*
+	 * If bfqq has requests queued and it has enough budget left to
+	 * serve them, keep the queue, otherwise expire it.
+	 */
+	if (next_rq != NULL) {
+		if (bfq_serv_to_charge(next_rq, bfqq) >
+			bfq_bfqq_budget_left(bfqq)) {
+			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
+			goto expire;
+		} else {
+			/*
+			 * The idle timer may be pending because we may
+			 * not disable disk idling even when a new request
+			 * arrives.
+			 */
+			if (timer_pending(&bfqd->idle_slice_timer)) {
+				/*
+				 * If we get here: 1) at least a new request
+				 * has arrived but we have not disabled the
+				 * timer because the request was too small,
+				 * 2) then the block layer has unplugged
+				 * the device, causing the dispatch to be
+				 * invoked.
+				 *
+				 * Since the device is unplugged, now the
+				 * requests are probably large enough to
+				 * provide a reasonable throughput.
+				 * So we disable idling.
+				 */
+				bfq_clear_bfqq_wait_request(bfqq);
+				del_timer(&bfqd->idle_slice_timer);
+			}
+			goto keep_queue;
+		}
+	}
+
+	/*
+	 * No requests pending. However, if the in-service queue is idling
+	 * for a new request, or has requests waiting for a completion and
+	 * may idle after their completion, then keep it anyway.
+	 */
+	if (timer_pending(&bfqd->idle_slice_timer) ||
+	    (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) {
+		bfqq = NULL;
+		goto keep_queue;
+	}
+
+	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, 0, reason);
+new_queue:
+	bfqq = bfq_set_in_service_queue(bfqd);
+	bfq_log(bfqd, "select_queue: new queue %d returned",
+		bfqq != NULL ? bfqq->pid : 0);
+keep_queue:
+	return bfqq;
+}
+
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
+		       entity->orig_weight * bfqq->wr_coeff);
+		if (entity->ioprio_changed)
+			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+		/*
+		 * If the queue was activated in a burst, or
+		 * too much time has elapsed from the beginning
+		 * of this weight-raising period, or the queue has
+		 * exceeded the acceptable number of cooperations,
+		 * then end weight raising.
+		 */
+		if (bfq_bfqq_in_large_burst(bfqq) ||
+		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
+		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfqq->last_wr_start_finish = jiffies;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     bfqq->last_wr_start_finish,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+			bfq_bfqq_end_wr(bfqq);
+		}
+	}
+	/* Update weight both if it must be raised and if it must be lowered */
+	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+		__bfq_entity_update_weight_prio(
+			bfq_entity_service_tree(entity),
+			entity);
+}
+
+/*
+ * Dispatch one request from bfqq, moving it to the request queue
+ * dispatch list.
+ */
+static int bfq_dispatch_request(struct bfq_data *bfqd,
+				struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+	struct request *rq;
+	unsigned long service_to_charge;
+
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Follow expired path, else get first next available. */
+	rq = bfq_check_fifo(bfqq);
+	if (rq == NULL)
+		rq = bfqq->next_rq;
+	service_to_charge = bfq_serv_to_charge(rq, bfqq);
+
+	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
+		/*
+		 * This may happen if the next rq is chosen in fifo order
+		 * instead of sector order. The budget is properly
+		 * dimensioned to be always sufficient to serve the next
+		 * request only if it is chosen in sector order. The reason
+		 * is that it would be quite inefficient and little useful
+		 * to always make sure that the budget is large enough to
+		 * serve even the possible next rq in fifo order.
+		 * In fact, requests are seldom served in fifo order.
+		 *
+		 * Expire the queue for budget exhaustion, and make sure
+		 * that the next act_budget is enough to serve the next
+		 * request, even if it comes from the fifo expired path.
+		 */
+		bfqq->next_rq = rq;
+		/*
+		 * Since this dispatch is failed, make sure that
+		 * a new one will be performed
+		 */
+		if (!bfqd->rq_in_driver)
+			bfq_schedule_dispatch(bfqd);
+		goto expire;
+	}
+
+	/* Finally, insert request into driver dispatch list. */
+	bfq_bfqq_served(bfqq, service_to_charge);
+	bfq_dispatch_insert(bfqd->queue, rq);
+
+	bfq_update_wr_data(bfqd, bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq,
+			"dispatched %u sec req (%llu), budg left %lu",
+			blk_rq_sectors(rq),
+			(long long unsigned)blk_rq_pos(rq),
+			bfq_bfqq_budget_left(bfqq));
+
+	dispatched++;
+
+	if (bfqd->in_service_bic == NULL) {
+		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
+		bfqd->in_service_bic = RQ_BIC(rq);
+	}
+
+	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
+	    dispatched >= bfqd->bfq_max_budget_async_rq) ||
+	    bfq_class_idle(bfqq)))
+		goto expire;
+
+	return dispatched;
+
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED);
+	return dispatched;
+}
+
+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+
+	while (bfqq->next_rq != NULL) {
+		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
+		dispatched++;
+	}
+
+	BUG_ON(!list_empty(&bfqq->fifo));
+	return dispatched;
+}
+
+/*
+ * Drain our current requests.
+ * Used for barriers and when switching io schedulers on-the-fly.
+ */
+static int bfq_forced_dispatch(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq, *n;
+	struct bfq_service_tree *st;
+	int dispatched = 0;
+
+	bfqq = bfqd->in_service_queue;
+	if (bfqq != NULL)
+		__bfq_bfqq_expire(bfqd, bfqq);
+
+	/*
+	 * Loop through classes, and be careful to leave the scheduler
+	 * in a consistent state, as feedback mechanisms and vtime
+	 * updates cannot be disabled during the process.
+	 */
+	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
+		st = bfq_entity_service_tree(&bfqq->entity);
+
+		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
+		bfqq->max_budget = bfq_max_budget(bfqd);
+
+		bfq_forget_idle(st);
+	}
+
+	BUG_ON(bfqd->busy_queues != 0);
+
+	return dispatched;
+}
+
+static int bfq_dispatch_requests(struct request_queue *q, int force)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq;
+	int max_dispatch;
+
+	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+	if (bfqd->busy_queues == 0)
+		return 0;
+
+	if (unlikely(force))
+		return bfq_forced_dispatch(bfqd);
+
+	bfqq = bfq_select_queue(bfqd);
+	if (bfqq == NULL)
+		return 0;
+
+	if (bfq_class_idle(bfqq))
+		max_dispatch = 1;
+
+	if (!bfq_bfqq_sync(bfqq))
+		max_dispatch = bfqd->bfq_max_budget_async_rq;
+
+	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
+		if (bfqd->busy_queues > 1)
+			return 0;
+		if (bfqq->dispatched >= 4 * max_dispatch)
+			return 0;
+	}
+
+	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
+		return 0;
+
+	bfq_clear_bfqq_wait_request(bfqq);
+	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+
+	if (!bfq_dispatch_request(bfqd, bfqq))
+		return 0;
+
+	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
+			bfq_bfqq_sync(bfqq) ? "sync" : "async");
+
+	return 1;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits.  Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Queue lock must be held here.
+ */
+static void bfq_put_queue(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	BUG_ON(atomic_read(&bfqq->ref) <= 0);
+
+	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
+		     atomic_read(&bfqq->ref));
+	if (!atomic_dec_and_test(&bfqq->ref))
+		return;
+
+	BUG_ON(rb_first(&bfqq->sort_list) != NULL);
+	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
+	BUG_ON(bfqq->entity.tree != NULL);
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqd->in_service_queue == bfqq);
+
+	if (bfq_bfqq_sync(bfqq))
+		/*
+		 * The fact that this queue is being destroyed does not
+		 * invalidate the fact that this queue may have been
+		 * activated during the current burst. As a consequence,
+		 * although the queue does not exist anymore, and hence
+		 * needs to be removed from the burst list if there,
+		 * the burst size has not to be decremented.
+		 */
+		hlist_del_init(&bfqq->burst_list_node);
+
+	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
+
+	kmem_cache_free(bfq_pool, bfqq);
+}
+
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+	struct bfq_queue *__bfqq, *next;
+
+	/*
+	 * If this queue was scheduled to merge with another queue, be
+	 * sure to drop the reference taken on that queue (and others in
+	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
+	 */
+	__bfqq = bfqq->new_bfqq;
+	while (__bfqq) {
+		if (__bfqq == bfqq)
+			break;
+		next = __bfqq->new_bfqq;
+		bfq_put_queue(__bfqq);
+		__bfqq = next;
+	}
+}
+
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	if (bfqq == bfqd->in_service_queue) {
+		__bfq_bfqq_expire(bfqd, bfqq);
+		bfq_schedule_dispatch(bfqd);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+}
+
+static inline void bfq_init_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *bic = icq_to_bic(icq);
+
+	bic->ttime.last_end_request = jiffies;
+	/*
+	 * A newly created bic indicates that the process has just
+	 * started doing I/O, and is probably mapping into memory its
+	 * executable and libraries: it definitely needs weight raising.
+	 * There is however the possibility that the process performs,
+	 * for a while, I/O close to some other process. EQM intercepts
+	 * this behavior and may merge the queue corresponding to the
+	 * process  with some other queue, BEFORE the weight of the queue
+	 * is raised. Merged queues are not weight-raised (they are assumed
+	 * to belong to processes that benefit only from high throughput).
+	 * If the merge is basically the consequence of an accident, then
+	 * the queue will be split soon and will get back its old weight.
+	 * It is then important to write down somewhere that this queue
+	 * does need weight raising, even if it did not make it to get its
+	 * weight raised before being merged. To this purpose, we overload
+	 * the field raising_time_left and assign 1 to it, to mark the queue
+	 * as needing weight raising.
+	 */
+	bic->wr_time_left = 1;
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *bic = icq_to_bic(icq);
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+	if (bic->bfqq[BLK_RW_ASYNC]) {
+		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
+		bic->bfqq[BLK_RW_ASYNC] = NULL;
+	}
+
+	if (bic->bfqq[BLK_RW_SYNC]) {
+		/*
+		 * If the bic is using a shared queue, put the reference
+		 * taken on the io_context when the bic started using a
+		 * shared bfq_queue.
+		 */
+		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
+			put_io_context(icq->ioc);
+		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
+		bic->bfqq[BLK_RW_SYNC] = NULL;
+	}
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ */
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	struct task_struct *tsk = current;
+	struct io_context *ioc = bic->icq.ioc;
+	int ioprio_class;
+
+	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
+	switch (ioprio_class) {
+	default:
+		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
+			"bfq: bad prio class %d\n", ioprio_class);
+	case IOPRIO_CLASS_NONE:
+		/*
+		 * No prio set, inherit CPU scheduling settings.
+		 */
+		bfqq->entity.new_ioprio = task_nice_ioprio(tsk);
+		bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk);
+		break;
+	case IOPRIO_CLASS_RT:
+		bfqq->entity.new_ioprio = task_ioprio(ioc);
+		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT;
+		break;
+	case IOPRIO_CLASS_BE:
+		bfqq->entity.new_ioprio = task_ioprio(ioc);
+		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE;
+		break;
+	case IOPRIO_CLASS_IDLE:
+		bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE;
+		bfqq->entity.new_ioprio = 7;
+		bfq_clear_bfqq_idle_window(bfqq);
+		break;
+	}
+
+	if (bfqq->entity.new_ioprio < 0 ||
+	    bfqq->entity.new_ioprio >= IOPRIO_BE_NR) {
+		printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n",
+				 bfqq->entity.new_ioprio);
+		BUG();
+	}
+
+	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio);
+	bfqq->entity.ioprio_changed = 1;
+}
+
+static void bfq_check_ioprio_change(struct io_context *ioc,
+				    struct bfq_io_cq *bic)
+{
+	struct bfq_data *bfqd;
+	struct bfq_queue *bfqq, *new_bfqq;
+	struct bfq_group *bfqg;
+	unsigned long uninitialized_var(flags);
+	int ioprio = bic->icq.ioc->ioprio;
+
+	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
+				   &flags);
+	if (unlikely(bfqd == NULL))
+		return;
+
+	bic->ioprio = ioprio;
+
+	bfqq = bic->bfqq[BLK_RW_ASYNC];
+	if (bfqq != NULL) {
+		bfqg = container_of(bfqq->entity.sched_data, struct bfq_group,
+				    sched_data);
+		new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc,
+					 GFP_ATOMIC);
+		if (new_bfqq != NULL) {
+			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "check_ioprio_change: bfqq %p %d",
+				     bfqq, atomic_read(&bfqq->ref));
+			bfq_put_queue(bfqq);
+		}
+	}
+
+	bfqq = bic->bfqq[BLK_RW_SYNC];
+	if (bfqq != NULL)
+		bfq_set_next_ioprio_data(bfqq, bic);
+
+	bfq_put_bfqd_unlock(bfqd, &flags);
+}
+
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+	RB_CLEAR_NODE(&bfqq->entity.rb_node);
+	INIT_LIST_HEAD(&bfqq->fifo);
+	INIT_HLIST_NODE(&bfqq->burst_list_node);
+
+	atomic_set(&bfqq->ref, 0);
+	bfqq->bfqd = bfqd;
+
+	if (bic)
+		bfq_set_next_ioprio_data(bfqq, bic);
+
+	if (is_sync) {
+		if (!bfq_class_idle(bfqq))
+			bfq_mark_bfqq_idle_window(bfqq);
+		bfq_mark_bfqq_sync(bfqq);
+	}
+	bfq_mark_bfqq_IO_bound(bfqq);
+
+	/* Tentative initial value to trade off between thr and lat */
+	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+	bfqq->pid = pid;
+
+	bfqq->wr_coeff = 1;
+	bfqq->last_wr_start_finish = 0;
+	/*
+	 * Set to the value for which bfqq will not be deemed as
+	 * soft rt when it becomes backlogged.
+	 */
+	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
+}
+
+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
+					      struct bfq_group *bfqg,
+					      int is_sync,
+					      struct io_context *ioc,
+					      gfp_t gfp_mask)
+{
+	struct bfq_queue *bfqq, *new_bfqq = NULL;
+	struct bfq_io_cq *bic;
+
+retry:
+	bic = bfq_bic_lookup(bfqd, ioc);
+	/* bic always exists here */
+	bfqq = bic_to_bfqq(bic, is_sync);
+
+	/*
+	 * Always try a new alloc if we fall back to the OOM bfqq
+	 * originally, since it should just be a temporary situation.
+	 */
+	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
+		bfqq = NULL;
+		if (new_bfqq != NULL) {
+			bfqq = new_bfqq;
+			new_bfqq = NULL;
+		} else if (gfp_mask & __GFP_WAIT) {
+			spin_unlock_irq(bfqd->queue->queue_lock);
+			new_bfqq = kmem_cache_alloc_node(bfq_pool,
+					gfp_mask | __GFP_ZERO,
+					bfqd->queue->node);
+			spin_lock_irq(bfqd->queue->queue_lock);
+			if (new_bfqq != NULL)
+				goto retry;
+		} else {
+			bfqq = kmem_cache_alloc_node(bfq_pool,
+					gfp_mask | __GFP_ZERO,
+					bfqd->queue->node);
+		}
+
+		if (bfqq != NULL) {
+			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+                                      is_sync);
+			bfq_init_entity(&bfqq->entity, bfqg);
+			bfq_log_bfqq(bfqd, bfqq, "allocated");
+		} else {
+			bfqq = &bfqd->oom_bfqq;
+			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+		}
+	}
+
+	if (new_bfqq != NULL)
+		kmem_cache_free(bfq_pool, new_bfqq);
+
+	return bfqq;
+}
+
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+					       struct bfq_group *bfqg,
+					       int ioprio_class, int ioprio)
+{
+	switch (ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		return &bfqg->async_bfqq[0][ioprio];
+	case IOPRIO_CLASS_BE:
+		return &bfqg->async_bfqq[1][ioprio];
+	case IOPRIO_CLASS_IDLE:
+		return &bfqg->async_idle_bfqq;
+	default:
+		BUG();
+	}
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bfq_group *bfqg, int is_sync,
+				       struct io_context *ioc, gfp_t gfp_mask)
+{
+	const int ioprio = task_ioprio(ioc);
+	const int ioprio_class = task_ioprio_class(ioc);
+	struct bfq_queue **async_bfqq = NULL;
+	struct bfq_queue *bfqq = NULL;
+
+	if (!is_sync) {
+		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+						  ioprio);
+		bfqq = *async_bfqq;
+	}
+
+	if (bfqq == NULL)
+		bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask);
+
+	/*
+	 * Pin the queue now that it's allocated, scheduler exit will
+	 * prune it.
+	 */
+	if (!is_sync && *async_bfqq == NULL) {
+		atomic_inc(&bfqq->ref);
+		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		*async_bfqq = bfqq;
+	}
+
+	atomic_inc(&bfqq->ref);
+	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+	return bfqq;
+}
+
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+				    struct bfq_io_cq *bic)
+{
+	unsigned long elapsed = jiffies - bic->ttime.last_end_request;
+	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
+
+	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
+	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
+	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
+				bic->ttime.ttime_samples;
+}
+
+static void bfq_update_io_seektime(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct request *rq)
+{
+	sector_t sdist;
+	u64 total;
+
+	if (bfqq->last_request_pos < blk_rq_pos(rq))
+		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
+	else
+		sdist = bfqq->last_request_pos - blk_rq_pos(rq);
+
+	/*
+	 * Don't allow the seek distance to get too large from the
+	 * odd fragment, pagein, etc.
+	 */
+	if (bfqq->seek_samples == 0) /* first request, not really a seek */
+		sdist = 0;
+	else if (bfqq->seek_samples <= 60) /* second & third seek */
+		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
+	else
+		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
+
+	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
+	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
+	total = bfqq->seek_total + (bfqq->seek_samples/2);
+	do_div(total, bfqq->seek_samples);
+	bfqq->seek_mean = (sector_t)total;
+
+	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
+			(u64)bfqq->seek_mean);
+}
+
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct bfq_io_cq *bic)
+{
+	int enable_idle;
+
+	/* Don't idle for async or idle io prio class. */
+	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+		return;
+
+	/* Idle window just restored, statistics are meaningless. */
+	if (bfq_bfqq_just_split(bfqq))
+		return;
+
+	enable_idle = bfq_bfqq_idle_window(bfqq);
+
+	if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 ||
+	    bfqd->bfq_slice_idle == 0 ||
+		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+			bfqq->wr_coeff == 1))
+		enable_idle = 0;
+	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
+		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+			bfqq->wr_coeff == 1)
+			enable_idle = 0;
+		else
+			enable_idle = 1;
+	}
+	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+		enable_idle);
+
+	if (enable_idle)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq.  Check if there's
+ * something we should do about it.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			    struct request *rq)
+{
+	struct bfq_io_cq *bic = RQ_BIC(rq);
+
+	if (rq->cmd_flags & REQ_META)
+		bfqq->meta_pending++;
+
+	bfq_update_io_thinktime(bfqd, bic);
+	bfq_update_io_seektime(bfqd, bfqq, rq);
+	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
+		bfq_clear_bfqq_constantly_seeky(bfqq);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
+			bfqd->const_seeky_busy_in_flight_queues--;
+		}
+	}
+	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+	    !BFQQ_SEEKY(bfqq))
+		bfq_update_idle_window(bfqd, bfqq, bic);
+	bfq_clear_bfqq_just_split(bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
+		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
+		     (long long unsigned)bfqq->seek_mean);
+
+	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+		int small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+				blk_rq_sectors(rq) < 32;
+		int budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+		/*
+		 * There is just this request queued: if the request
+		 * is small and the queue is not to be expired, then
+		 * just exit.
+		 *
+		 * In this way, if the disk is being idled to wait for
+		 * a new request from the in-service queue, we avoid
+		 * unplugging the device and committing the disk to serve
+		 * just a small request. On the contrary, we wait for
+		 * the block layer to decide when to unplug the device:
+		 * hopefully, new requests will be merged to this one
+		 * quickly, then the device will be unplugged and
+		 * larger requests will be dispatched.
+		 */
+		if (small_req && !budget_timeout)
+			return;
+
+		/*
+		 * A large enough request arrived, or the queue is to
+		 * be expired: in both cases disk idling is to be
+		 * stopped, so clear wait_request flag and reset
+		 * timer.
+		 */
+		bfq_clear_bfqq_wait_request(bfqq);
+		del_timer(&bfqd->idle_slice_timer);
+
+		/*
+		 * The queue is not empty, because a new request just
+		 * arrived. Hence we can safely expire the queue, in
+		 * case of budget timeout, without risking that the
+		 * timestamps of the queue are not updated correctly.
+		 * See [1] for more details.
+		 */
+		if (budget_timeout)
+			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
+
+		/*
+		 * Let the request rip immediately, or let a new queue be
+		 * selected if bfqq has just been expired.
+		 */
+		__blk_run_queue(bfqd->queue);
+	}
+}
+
+static void bfq_insert_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/*
+	 * An unplug may trigger a requeue of a request from the device
+	 * driver: make sure we are in process context while trying to
+	 * merge two bfq_queues.
+	 */
+	if (!in_interrupt()) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+		if (new_bfqq != NULL) {
+			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+			/*
+			 * Release the request's reference to the old bfqq
+			 * and make sure one is taken to the shared queue.
+			 */
+			new_bfqq->allocated[rq_data_dir(rq)]++;
+			bfqq->allocated[rq_data_dir(rq)]--;
+			atomic_inc(&new_bfqq->ref);
+			bfq_put_queue(bfqq);
+			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+						bfqq, new_bfqq);
+			rq->elv.priv[1] = new_bfqq;
+			bfqq = new_bfqq;
+		} else
+			bfq_bfqq_increase_failed_cooperations(bfqq);
+	}
+
+	bfq_add_request(rq);
+
+	/*
+	 * Here a newly-created bfq_queue has already started a weight-raising
+	 * period: clear raising_time_left to prevent bfq_bfqq_save_state()
+	 * from assigning it a full weight-raising period. See the detailed
+	 * comments about this field in bfq_init_icq().
+	 */
+	if (bfqq->bic != NULL)
+		bfqq->bic->wr_time_left = 0;
+	rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]);
+	list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+	bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
+				     bfqd->rq_in_driver);
+
+	if (bfqd->hw_tag == 1)
+		return;
+
+	/*
+	 * This sample is valid if the number of outstanding requests
+	 * is large enough to allow a queueing behavior.  Note that the
+	 * sum is not exact, as it's not taking into account deactivated
+	 * requests.
+	 */
+	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+		return;
+
+	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+		return;
+
+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+	bfqd->max_rq_in_driver = 0;
+	bfqd->hw_tag_samples = 0;
+}
+
+static void bfq_completed_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	bool sync = bfq_bfqq_sync(bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
+		     blk_rq_sectors(rq), sync);
+
+	bfq_update_hw_tag(bfqd);
+
+	BUG_ON(!bfqd->rq_in_driver);
+	BUG_ON(!bfqq->dispatched);
+	bfqd->rq_in_driver--;
+	bfqq->dispatched--;
+
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->busy_in_flight_queues);
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq)) {
+				BUG_ON(!bfqd->
+					const_seeky_busy_in_flight_queues);
+				bfqd->const_seeky_busy_in_flight_queues--;
+			}
+		}
+	}
+
+	if (sync) {
+		bfqd->sync_flight--;
+		RQ_BIC(rq)->ttime.last_end_request = jiffies;
+	}
+
+	/*
+	 * If we are waiting to discover whether the request pattern of the
+	 * task associated with the queue is actually isochronous, and
+	 * both requisites for this condition to hold are satisfied, then
+	 * compute soft_rt_next_start (see the comments to the function
+	 * bfq_bfqq_softrt_next_start()).
+	 */
+	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list))
+		bfqq->soft_rt_next_start =
+			bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+	/*
+	 * If this is the in-service queue, check if it needs to be expired,
+	 * or if we want to idle in case it has no pending requests.
+	 */
+	if (bfqd->in_service_queue == bfqq) {
+		if (bfq_bfqq_budget_new(bfqq))
+			bfq_set_budget_timeout(bfqd);
+
+		if (bfq_bfqq_must_idle(bfqq)) {
+			bfq_arm_slice_timer(bfqd);
+			goto out;
+		} else if (bfq_may_expire_for_budg_timeout(bfqq))
+			bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT);
+		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+			 (bfqq->dispatched == 0 ||
+			  !bfq_bfqq_must_not_expire(bfqq)))
+			bfq_bfqq_expire(bfqd, bfqq, 0,
+					BFQ_BFQQ_NO_MORE_REQUESTS);
+	}
+
+	if (!bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+
+out:
+	return;
+}
+
+static inline int __bfq_may_queue(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
+		bfq_clear_bfqq_must_alloc(bfqq);
+		return ELV_MQUEUE_MUST;
+	}
+
+	return ELV_MQUEUE_MAY;
+}
+
+static int bfq_may_queue(struct request_queue *q, int rw)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	/*
+	 * Don't force setup of a queue from here, as a call to may_queue
+	 * does not necessarily imply that a request actually will be
+	 * queued. So just lookup a possibly existing queue, or return
+	 * 'may queue' if that fails.
+	 */
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (bic == NULL)
+		return ELV_MQUEUE_MAY;
+
+	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
+	if (bfqq != NULL)
+		return __bfq_may_queue(bfqq);
+
+	return ELV_MQUEUE_MAY;
+}
+
+/*
+ * Queue lock held here.
+ */
+static void bfq_put_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	if (bfqq != NULL) {
+		const int rw = rq_data_dir(rq);
+
+		BUG_ON(!bfqq->allocated[rw]);
+		bfqq->allocated[rw]--;
+
+		rq->elv.priv[0] = NULL;
+		rq->elv.priv[1] = NULL;
+
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+	}
+}
+
+/*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to said bfqq.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+	put_io_context(bic->icq.ioc);
+
+	if (bfqq_process_refs(bfqq) == 1) {
+		bfqq->pid = current->pid;
+		bfq_clear_bfqq_coop(bfqq);
+		bfq_clear_bfqq_split_coop(bfqq);
+		return bfqq;
+	}
+
+	bic_set_bfqq(bic, NULL, 1);
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+	return NULL;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ */
+static int bfq_set_request(struct request_queue *q, struct request *rq,
+			   gfp_t gfp_mask)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+	const int rw = rq_data_dir(rq);
+	const int is_sync = rq_is_sync(rq);
+	struct bfq_queue *bfqq;
+	struct bfq_group *bfqg;
+	unsigned long flags;
+	bool split = false;
+
+	/* handle changed prio notifications; cgroup change is handled separately */
+	if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED))
+		bfq_check_ioprio_change(bic->icq.ioc, bic);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (bic == NULL)
+		goto queue_fail;
+
+	bfqg = bfq_bic_update_cgroup(bic);
+
+new_queue:
+	bfqq = bic_to_bfqq(bic, is_sync);
+	if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) {
+		bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask);
+		bic_set_bfqq(bic, bfqq, is_sync);
+		if (split && is_sync) {
+			if ((bic->was_in_burst_list && bfqd->large_burst) ||
+			    bic->saved_in_large_burst)
+				bfq_mark_bfqq_in_large_burst(bfqq);
+			else {
+			    bfq_clear_bfqq_in_large_burst(bfqq);
+			    if (bic->was_in_burst_list)
+			       hlist_add_head(&bfqq->burst_list_node,
+				              &bfqd->burst_list);
+			}
+		}
+	} else {
+		/* If the queue was seeky for too long, break it apart. */
+		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+			bfqq = bfq_split_bfqq(bic, bfqq);
+			split = true;
+			if (!bfqq)
+				goto new_queue;
+		}
+	}
+
+	bfqq->allocated[rw]++;
+	atomic_inc(&bfqq->ref);
+	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
+		     atomic_read(&bfqq->ref));
+
+	rq->elv.priv[0] = bic;
+	rq->elv.priv[1] = bfqq;
+
+	/*
+	 * If a bfq_queue has only one process reference, it is owned
+	 * by only one bfq_io_cq: we can set the bic field of the
+	 * bfq_queue to the address of that structure. Also, if the
+	 * queue has just been split, mark a flag so that the
+	 * information is available to the other scheduler hooks.
+	 */
+	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+		bfqq->bic = bic;
+		if (split) {
+			bfq_mark_bfqq_just_split(bfqq);
+			/*
+			 * If the queue has just been split from a shared
+			 * queue, restore the idle window and the possible
+			 * weight raising period.
+			 */
+			bfq_bfqq_resume_state(bfqq, bic);
+		}
+	}
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+
+queue_fail:
+	bfq_schedule_dispatch(bfqd);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 1;
+}
+
+static void bfq_kick_queue(struct work_struct *work)
+{
+	struct bfq_data *bfqd =
+		container_of(work, struct bfq_data, unplug_work);
+	struct request_queue *q = bfqd->queue;
+
+	spin_lock_irq(q->queue_lock);
+	__blk_run_queue(q);
+	spin_unlock_irq(q->queue_lock);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice.
+ */
+static void bfq_idle_slice_timer(unsigned long data)
+{
+	struct bfq_data *bfqd = (struct bfq_data *)data;
+	struct bfq_queue *bfqq;
+	unsigned long flags;
+	enum bfqq_expiration reason;
+
+	spin_lock_irqsave(bfqd->queue->queue_lock, flags);
+
+	bfqq = bfqd->in_service_queue;
+	/*
+	 * Theoretical race here: the in-service queue can be NULL or
+	 * different from the queue that was idling if the timer handler
+	 * spins on the queue_lock and a new request arrives for the
+	 * current queue and there is a full dispatch cycle that changes
+	 * the in-service queue.  This can hardly happen, but in the worst
+	 * case we just expire a queue too early.
+	 */
+	if (bfqq != NULL) {
+		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
+		if (bfq_bfqq_budget_timeout(bfqq))
+			/*
+			 * Also here the queue can be safely expired
+			 * for budget timeout without wasting
+			 * guarantees
+			 */
+			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+			/*
+			 * The queue may not be empty upon timer expiration,
+			 * because we may not disable the timer when the
+			 * first request of the in-service queue arrives
+			 * during disk idling.
+			 */
+			reason = BFQ_BFQQ_TOO_IDLE;
+		else
+			goto schedule_dispatch;
+
+		bfq_bfqq_expire(bfqd, bfqq, 1, reason);
+	}
+
+schedule_dispatch:
+	bfq_schedule_dispatch(bfqd);
+
+	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
+}
+
+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
+{
+	del_timer_sync(&bfqd->idle_slice_timer);
+	cancel_work_sync(&bfqd->unplug_work);
+}
+
+static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+					struct bfq_queue **bfqq_ptr)
+{
+	struct bfq_group *root_group = bfqd->root_group;
+	struct bfq_queue *bfqq = *bfqq_ptr;
+
+	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+	if (bfqq != NULL) {
+		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
+		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+		*bfqq_ptr = NULL;
+	}
+}
+
+/*
+ * Release all the bfqg references to its async queues.  If we are
+ * deallocating the group these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
+ */
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+
+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	struct request_queue *q = bfqd->queue;
+	struct bfq_queue *bfqq, *n;
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	spin_lock_irq(q->queue_lock);
+
+	BUG_ON(bfqd->in_service_queue != NULL);
+	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+		bfq_deactivate_bfqq(bfqd, bfqq, 0);
+
+	bfq_disconnect_groups(bfqd);
+	spin_unlock_irq(q->queue_lock);
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	synchronize_rcu();
+
+	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+
+	bfq_free_root_group(bfqd);
+	kfree(bfqd);
+}
+
+static void *bfq_init_queue(struct request_queue *q)
+{
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+
+	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+	if (bfqd == NULL)
+		return NULL;
+
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+	atomic_inc(&bfqd->oom_bfqq.ref);
+	bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio);
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.ioprio_changed = 1;
+
+	bfqd->queue = q;
+
+	bfqg = bfq_alloc_root_group(bfqd, q->node);
+	if (bfqg == NULL) {
+		kfree(bfqd);
+		return NULL;
+	}
+
+	bfqd->root_group = bfqg;
+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+#ifdef CONFIG_CGROUP_BFQIO
+	bfqd->active_numerous_groups = 0;
+#endif
+
+	init_timer(&bfqd->idle_slice_timer);
+	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+	bfqd->idle_slice_timer.data = (unsigned long)bfqd;
+
+	bfqd->rq_pos_tree = RB_ROOT;
+	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->group_weights_tree = RB_ROOT;
+
+	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
+
+	INIT_LIST_HEAD(&bfqd->active_list);
+	INIT_LIST_HEAD(&bfqd->idle_list);
+	INIT_HLIST_HEAD(&bfqd->burst_list);
+
+	bfqd->hw_tag = -1;
+
+	bfqd->bfq_max_budget = bfq_default_max_budget;
+
+	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+	bfqd->bfq_back_max = bfq_back_max;
+	bfqd->bfq_back_penalty = bfq_back_penalty;
+	bfqd->bfq_slice_idle = bfq_slice_idle;
+	bfqd->bfq_class_idle_last_service = 0;
+	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
+	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
+	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
+
+	bfqd->bfq_coop_thresh = 2;
+	bfqd->bfq_failed_cooperations = 7000;
+	bfqd->bfq_requests_within_timer = 120;
+
+	bfqd->bfq_large_burst_thresh = 11;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(500);
+
+	bfqd->low_latency = true;
+
+	bfqd->bfq_wr_coeff = 20;
+	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+	bfqd->bfq_wr_max_time = 0;
+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+	bfqd->bfq_wr_max_softrt_rate = 7000; /*
+					      * Approximate rate required
+					      * to playback or record a
+					      * high-definition compressed
+					      * video.
+					      */
+	bfqd->wr_busy_queues = 0;
+	bfqd->busy_in_flight_queues = 0;
+	bfqd->const_seeky_busy_in_flight_queues = 0;
+
+	/*
+	 * Begin by assuming, optimistically, that the device peak rate is
+	 * equal to the highest reference rate.
+	 */
+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+			T_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->device_speed = BFQ_BFQD_FAST;
+
+	return bfqd;
+}
+
+static void bfq_slab_kill(void)
+{
+	if (bfq_pool != NULL)
+		kmem_cache_destroy(bfq_pool);
+}
+
+static int __init bfq_slab_setup(void)
+{
+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
+	if (bfq_pool == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+			     size_t count)
+{
+	unsigned long new_val;
+	int ret = kstrtoul(page, 10, &new_val);
+
+	if (ret == 0)
+		*var = new_val;
+
+	return count;
+}
+
+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
+		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
+		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
+}
+
+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_queue *bfqq;
+	struct bfq_data *bfqd = e->elevator_data;
+	ssize_t num_char = 0;
+
+	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
+			    bfqd->queued);
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	num_char += sprintf(page + num_char, "Active:\n");
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
+	  num_char += sprintf(page + num_char,
+			      "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n",
+			      bfqq->pid,
+			      bfqq->entity.weight,
+			      bfqq->queued[0],
+			      bfqq->queued[1],
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	num_char += sprintf(page + num_char, "Idle:\n");
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
+			num_char += sprintf(page + num_char,
+				"pid%d: weight %hu, dur %d/%u\n",
+				bfqq->pid,
+				bfqq->entity.weight,
+				jiffies_to_msecs(jiffies -
+					bfqq->last_wr_start_finish),
+				jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+
+	return num_char;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned int __data = __VAR;					\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return bfq_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_max_budget_async_rq_show,
+	      bfqd->bfq_max_budget_async_rq, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
+	1);
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t								\
+__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long uninitialized_var(__data);			\
+	int ret = bfq_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+		INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
+		1, INT_MAX, 0);
+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
+		1);
+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
+		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
+		INT_MAX, 0);
+#undef STORE_FUNCTION
+
+/* do nothing for the moment */
+static ssize_t bfq_weights_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	return count;
+}
+
+static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
+{
+	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
+
+	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
+		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
+	else
+		return bfq_default_max_budget;
+}
+
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data == 0)
+		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+	else {
+		if (__data > INT_MAX)
+			__data = INT_MAX;
+		bfqd->bfq_max_budget = __data;
+	}
+
+	bfqd->bfq_user_max_budget = __data;
+
+	return ret;
+}
+
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+				      const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data < 1)
+		__data = 1;
+	else if (__data > INT_MAX)
+		__data = INT_MAX;
+
+	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+
+	return ret;
+}
+
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+
+#define BFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+	BFQ_ATTR(fifo_expire_sync),
+	BFQ_ATTR(fifo_expire_async),
+	BFQ_ATTR(back_seek_max),
+	BFQ_ATTR(back_seek_penalty),
+	BFQ_ATTR(slice_idle),
+	BFQ_ATTR(max_budget),
+	BFQ_ATTR(max_budget_async_rq),
+	BFQ_ATTR(timeout_sync),
+	BFQ_ATTR(timeout_async),
+	BFQ_ATTR(low_latency),
+	BFQ_ATTR(wr_coeff),
+	BFQ_ATTR(wr_max_time),
+	BFQ_ATTR(wr_rt_max_time),
+	BFQ_ATTR(wr_min_idle_time),
+	BFQ_ATTR(wr_min_inter_arr_async),
+	BFQ_ATTR(wr_max_softrt_rate),
+	BFQ_ATTR(weights),
+	__ATTR_NULL
+};
+
+static struct elevator_type iosched_bfq = {
+	.ops = {
+		.elevator_merge_fn =		bfq_merge,
+		.elevator_merged_fn =		bfq_merged_request,
+		.elevator_merge_req_fn =	bfq_merged_requests,
+		.elevator_allow_merge_fn =	bfq_allow_merge,
+		.elevator_dispatch_fn =		bfq_dispatch_requests,
+		.elevator_add_req_fn =		bfq_insert_request,
+		.elevator_activate_req_fn =	bfq_activate_request,
+		.elevator_deactivate_req_fn =	bfq_deactivate_request,
+		.elevator_completed_req_fn =	bfq_completed_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
+		.elevator_init_icq_fn =		bfq_init_icq,
+		.elevator_exit_icq_fn =		bfq_exit_icq,
+		.elevator_set_req_fn =		bfq_set_request,
+		.elevator_put_req_fn =		bfq_put_request,
+		.elevator_may_queue_fn =	bfq_may_queue,
+		.elevator_init_fn =		bfq_init_queue,
+		.elevator_exit_fn =		bfq_exit_queue,
+	},
+	.icq_size =		sizeof(struct bfq_io_cq),
+	.icq_align =		__alignof__(struct bfq_io_cq),
+	.elevator_attrs =	bfq_attrs,
+	.elevator_name =	"bfq",
+	.elevator_owner =	THIS_MODULE,
+};
+
+static int __init bfq_init(void)
+{
+	/*
+	 * Can be 0 on HZ < 1000 setups.
+	 */
+	if (bfq_slice_idle == 0)
+		bfq_slice_idle = 1;
+
+	if (bfq_timeout_async == 0)
+		bfq_timeout_async = 1;
+
+	if (bfq_slab_setup())
+		return -ENOMEM;
+
+	/*
+	 * Times to load large popular applications for the typical systems
+	 * installed on the reference devices (see the comments before the
+	 * definitions of the two arrays).
+	 */
+	T_slow[0] = msecs_to_jiffies(2600);
+	T_slow[1] = msecs_to_jiffies(1000);
+	T_fast[0] = msecs_to_jiffies(5500);
+	T_fast[1] = msecs_to_jiffies(2000);
+
+	/*
+	 * Thresholds that determine the switch between speed classes (see
+	 * the comments before the definition of the array).
+	 */
+	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
+	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
+
+	elv_register(&iosched_bfq);
+	pr_info("BFQ I/O-scheduler: v7r8");
+
+	return 0;
+}
+
+static void __exit bfq_exit(void)
+{
+	elv_unregister(&iosched_bfq);
+	bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Fabio Checconi, Paolo Valente");
+MODULE_LICENSE("GPL");
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
new file mode 100644
index 000000000..d0890c6d4
--- /dev/null
+++ b/block/bfq-sched.c
@@ -0,0 +1,1180 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#ifdef CONFIG_CGROUP_BFQIO
+#define for_each_entity(entity)	\
+	for (; entity != NULL; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd);
+
+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+	struct bfq_entity *bfqg_entity;
+	struct bfq_group *bfqg;
+	struct bfq_sched_data *group_sd;
+
+	BUG_ON(next_in_service == NULL);
+
+	group_sd = next_in_service->sched_data;
+
+	bfqg = container_of(group_sd, struct bfq_group, sched_data);
+	/*
+	 * bfq_group's my_entity field is not NULL only if the group
+	 * is not the root group. We must not touch the root entity
+	 * as it must never become an in-service entity.
+	 */
+	bfqg_entity = bfqg->my_entity;
+	if (bfqg_entity != NULL)
+		bfqg_entity->budget = next_in_service->budget;
+}
+
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	struct bfq_entity *next_in_service;
+
+	if (sd->in_service_entity != NULL)
+		/* will update/requeue at the end of service */
+		return 0;
+
+	/*
+	 * NOTE: this can be improved in many ways, such as returning
+	 * 1 (and thus propagating upwards the update) only when the
+	 * budget changes, or caching the bfqq that will be scheduled
+	 * next from this subtree.  By now we worry more about
+	 * correctness than about performance...
+	 */
+	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
+	sd->next_in_service = next_in_service;
+
+	if (next_in_service != NULL)
+		bfq_update_budget(next_in_service);
+
+	return 1;
+}
+
+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
+					     struct bfq_entity *entity)
+{
+	BUG_ON(sd->next_in_service != entity);
+}
+#else
+#define for_each_entity(entity)	\
+	for (; entity != NULL; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity != NULL; entity = parent)
+
+static inline int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	return 0;
+}
+
+static inline void bfq_check_next_in_service(struct bfq_sched_data *sd,
+					     struct bfq_entity *entity)
+{
+}
+
+static inline void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+}
+#endif
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT	22
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static inline int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
+static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = NULL;
+
+	BUG_ON(entity == NULL);
+
+	if (entity->my_sched_data == NULL)
+		bfqq = container_of(entity, struct bfq_queue, entity);
+
+	return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ */
+static inline u64 bfq_delta(unsigned long service,
+					unsigned long weight)
+{
+	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+	do_div(d, weight);
+	return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static inline void bfq_calc_finish(struct bfq_entity *entity,
+				   unsigned long service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	BUG_ON(entity->weight == 0);
+
+	entity->finish = entity->start +
+		bfq_delta(service, entity->weight);
+
+	if (bfqq != NULL) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: serv %lu, w %d",
+			service, entity->weight);
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: start %llu, finish %llu, delta %llu",
+			entity->start, entity->finish,
+			bfq_delta(service, entity->weight));
+	}
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity.  This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static inline struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+	struct bfq_entity *entity = NULL;
+
+	if (node != NULL)
+		entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static inline void bfq_extract(struct rb_root *root,
+			       struct bfq_entity *entity)
+{
+	BUG_ON(entity->tree != root);
+
+	entity->tree = NULL;
+	rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+			     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *next;
+
+	BUG_ON(entity->tree != &st->idle);
+
+	if (entity == st->first_idle) {
+		next = rb_next(&entity->rb_node);
+		st->first_idle = bfq_entity_of(next);
+	}
+
+	if (entity == st->last_idle) {
+		next = rb_prev(&entity->rb_node);
+		st->last_idle = bfq_entity_of(next);
+	}
+
+	bfq_extract(&st->idle, entity);
+
+	if (bfqq != NULL)
+		list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+	struct bfq_entity *entry;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(entity->tree != NULL);
+
+	while (*node != NULL) {
+		parent = *node;
+		entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+		if (bfq_gt(entry->finish, entity->finish))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, root);
+
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree.  The function  assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static inline void bfq_update_min(struct bfq_entity *entity,
+				  struct rb_node *node)
+{
+	struct bfq_entity *child;
+
+	if (node != NULL) {
+		child = rb_entry(node, struct bfq_entity, rb_node);
+		if (bfq_gt(entity->min_start, child->min_start))
+			entity->min_start = child->min_start;
+	}
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value.  The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static inline void bfq_update_active_node(struct rb_node *node)
+{
+	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	entity->min_start = entity->start;
+	bfq_update_min(entity, node->rb_right);
+	bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update.  This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root.  The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+up:
+	bfq_update_active_node(node);
+
+	parent = rb_parent(node);
+	if (parent == NULL)
+		return;
+
+	if (node == parent->rb_left && parent->rb_right != NULL)
+		bfq_update_active_node(parent->rb_right);
+	else if (parent->rb_left != NULL)
+		bfq_update_active_node(parent->rb_left);
+
+	node = parent;
+	goto up;
+}
+
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root);
+
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root);
+
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_CGROUP_BFQIO
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	bfq_insert(&st->active, entity);
+
+	if (node->rb_left != NULL)
+		node = node->rb_left;
+	else if (node->rb_right != NULL)
+		node = node->rb_right;
+
+	bfq_update_active_tree(node);
+
+#ifdef CONFIG_CGROUP_BFQIO
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq != NULL)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_CGROUP_BFQIO
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		bfqg->active_entities++;
+		if (bfqg->active_entities == 2)
+			bfqd->active_numerous_groups++;
+	}
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static inline unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+	return IOPRIO_BE_NR - ioprio;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as mush as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR
+ */
+static inline unsigned short bfq_weight_to_ioprio(int weight)
+{
+	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
+	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
+}
+
+static inline void bfq_get_entity(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (bfqq != NULL) {
+		atomic_inc(&bfqq->ref);
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+			     bfqq, atomic_read(&bfqq->ref));
+	}
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch.  If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *deepest;
+
+	if (node->rb_right == NULL && node->rb_left == NULL)
+		deepest = rb_parent(node);
+	else if (node->rb_right == NULL)
+		deepest = node->rb_left;
+	else if (node->rb_left == NULL)
+		deepest = node->rb_right;
+	else {
+		deepest = rb_next(node);
+		if (deepest->rb_right != NULL)
+			deepest = deepest->rb_right;
+		else if (rb_parent(deepest) != node)
+			deepest = rb_parent(deepest);
+	}
+
+	return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+			       struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node;
+#ifdef CONFIG_CGROUP_BFQIO
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	node = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (node != NULL)
+		bfq_update_active_tree(node);
+
+#ifdef CONFIG_CGROUP_BFQIO
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq != NULL)
+		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_CGROUP_BFQIO
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		BUG_ON(!bfqg->active_entities);
+		bfqg->active_entities--;
+		if (bfqg->active_entities == 1) {
+			BUG_ON(!bfqd->active_numerous_groups);
+			bfqd->active_numerous_groups--;
+		}
+	}
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+			    struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
+		st->first_idle = entity;
+	if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	if (bfqq != NULL)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, putting
+ * the device reference to it, if it is a queue.  Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_sched_data *sd;
+
+	BUG_ON(!entity->on_st);
+
+	entity->on_st = 0;
+	st->wsum -= entity->weight;
+	if (bfqq != NULL) {
+		sd = entity->sched_data;
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
+			     bfqq, atomic_read(&bfqq->ref));
+		bfq_put_queue(bfqq);
+	}
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+static void bfq_put_idle_entity(struct bfq_service_tree *st,
+				struct bfq_entity *entity)
+{
+	bfq_idle_extract(st, entity);
+	bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
+	    !bfq_gt(last_idle->finish, st->vtime)) {
+		/*
+		 * Forget the whole idle tree, increasing the vtime past
+		 * the last finish time of idle entities.
+		 */
+		st->vtime = last_idle->finish;
+	}
+
+	if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
+		bfq_put_idle_entity(st, first_idle);
+}
+
+static struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+			 struct bfq_entity *entity)
+{
+	struct bfq_service_tree *new_st = old_st;
+
+	if (entity->ioprio_changed) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+		unsigned short prev_weight, new_weight;
+		struct bfq_data *bfqd = NULL;
+		struct rb_root *root;
+#ifdef CONFIG_CGROUP_BFQIO
+		struct bfq_sched_data *sd;
+		struct bfq_group *bfqg;
+#endif
+
+		if (bfqq != NULL)
+			bfqd = bfqq->bfqd;
+#ifdef CONFIG_CGROUP_BFQIO
+		else {
+			sd = entity->my_sched_data;
+			bfqg = container_of(sd, struct bfq_group, sched_data);
+			BUG_ON(!bfqg);
+			bfqd = (struct bfq_data *)bfqg->bfqd;
+			BUG_ON(!bfqd);
+		}
+#endif
+
+		BUG_ON(old_st->wsum < entity->weight);
+		old_st->wsum -= entity->weight;
+
+		if (entity->new_weight != entity->orig_weight) {
+			if (entity->new_weight < BFQ_MIN_WEIGHT ||
+			    entity->new_weight > BFQ_MAX_WEIGHT) {
+				printk(KERN_CRIT "update_weight_prio: "
+						 "new_weight %d\n",
+					entity->new_weight);
+				BUG();
+			}
+			entity->orig_weight = entity->new_weight;
+			entity->ioprio =
+				bfq_weight_to_ioprio(entity->orig_weight);
+		}
+
+		entity->ioprio_class = entity->new_ioprio_class;
+		entity->ioprio_changed = 0;
+
+		/*
+		 * NOTE: here we may be changing the weight too early,
+		 * this will cause unfairness.  The correct approach
+		 * would have required additional complexity to defer
+		 * weight changes to the proper time instants (i.e.,
+		 * when entity->finish <= old_st->vtime).
+		 */
+		new_st = bfq_entity_service_tree(entity);
+
+		prev_weight = entity->weight;
+		new_weight = entity->orig_weight *
+			     (bfqq != NULL ? bfqq->wr_coeff : 1);
+		/*
+		 * If the weight of the entity changes, remove the entity
+		 * from its old weight counter (if there is a counter
+		 * associated with the entity), and add it to the counter
+		 * associated with its new weight.
+		 */
+		if (prev_weight != new_weight) {
+			root = bfqq ? &bfqd->queue_weights_tree :
+				      &bfqd->group_weights_tree;
+			bfq_weights_tree_remove(bfqd, entity, root);
+		}
+		entity->weight = new_weight;
+		/*
+		 * Add the entity to its weights tree only if it is
+		 * not associated with a weight-raised queue.
+		 */
+		if (prev_weight != new_weight &&
+		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+			/* If we get here, root has been initialized. */
+			bfq_weights_tree_add(bfqd, entity, root);
+
+		new_st->wsum += entity->weight;
+
+		if (new_st != old_st)
+			entity->start = new_st->vtime;
+	}
+
+	return new_st;
+}
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service.  By now,
+ * we keep it to better check consistency.
+ */
+static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st;
+
+	for_each_entity(entity) {
+		st = bfq_entity_service_tree(entity);
+
+		entity->service += served;
+		BUG_ON(entity->service > entity->budget);
+		BUG_ON(st->wsum == 0);
+
+		st->vtime += bfq_delta(served, st->wsum);
+		bfq_forget_idle(st);
+	}
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served);
+}
+
+/**
+ * bfq_bfqq_charge_full_budget - set the service to the entity budget.
+ * @bfqq: the queue that needs a service update.
+ *
+ * When it's not possible to be fair in the service domain, because
+ * a queue is not consuming its budget fast enough (the meaning of
+ * fast depends on the timeout parameter), we charge it a full
+ * budget.  In this way we should obtain a sort of time-domain
+ * fairness among all the seeky/slow queues.
+ */
+static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
+
+	bfq_bfqq_served(bfqq, entity->budget - entity->service);
+}
+
+/**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+ * budget exhaustion.  It uses the current budget of the entity (and the
+ * service received if @entity is active) of the queue to calculate its
+ * timestamps.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (entity == sd->in_service_entity) {
+		BUG_ON(entity->tree != NULL);
+		/*
+		 * If we are requeueing the current entity we have
+		 * to take care of not charging to it service it has
+		 * not received.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active) {
+		/*
+		 * Requeueing an entity due to a change of some
+		 * next_in_service entity below it.  We reuse the
+		 * old start time.
+		 */
+		bfq_active_extract(st, entity);
+	} else if (entity->tree == &st->idle) {
+		/*
+		 * Must be on the idle tree, bfq_idle_extract() will
+		 * check for that.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(st->vtime, entity->finish) ?
+				       st->vtime : entity->finish;
+	} else {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = st->vtime;
+		st->wsum += entity->weight;
+		bfq_get_entity(entity);
+
+		BUG_ON(entity->on_st);
+		entity->on_st = 1;
+	}
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+	bfq_active_insert(st, entity);
+}
+
+/**
+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
+ * @entity: the entity to activate.
+ *
+ * Activate @entity and all the entities on the path from it to the root.
+ */
+static void bfq_activate_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd;
+
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd))
+			/*
+			 * No need to propagate the activation to the
+			 * upper entities, as they will be updated when
+			 * the in-service entity is rescheduled.
+			 */
+			break;
+	}
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @requeue: if false, the entity will not be put into the idle tree.
+ *
+ * Deactivate an entity, independently from its previous state.  If the
+ * entity was not on a service tree just return, otherwise if it is on
+ * any scheduler tree, extract it from that tree, and if necessary
+ * and if the caller did not specify @requeue, put it on the idle tree.
+ *
+ * Return %1 if the caller should update the entity hierarchy, i.e.,
+ * if the entity was in service or if it was the next_in_service for
+ * its sched_data; return %0 otherwise.
+ */
+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	int was_in_service = entity == sd->in_service_entity;
+	int ret = 0;
+
+	if (!entity->on_st)
+		return 0;
+
+	BUG_ON(was_in_service && entity->tree != NULL);
+
+	if (was_in_service) {
+		bfq_calc_finish(entity, entity->service);
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+	else if (entity->tree != NULL)
+		BUG();
+
+	if (was_in_service || sd->next_in_service == entity)
+		ret = bfq_update_next_in_service(sd);
+
+	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+		bfq_forget_entity(st, entity);
+	else
+		bfq_idle_insert(st, entity);
+
+	BUG_ON(sd->in_service_entity == entity);
+	BUG_ON(sd->next_in_service == entity);
+
+	return ret;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity.
+ * @entity: the entity to deactivate.
+ * @requeue: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd;
+	struct bfq_entity *parent;
+
+	for_each_entity_safe(entity, parent) {
+		sd = entity->sched_data;
+
+		if (!__bfq_deactivate_entity(entity, requeue))
+			/*
+			 * The parent entity is still backlogged, and
+			 * we don't need to update it as it is still
+			 * in service.
+			 */
+			break;
+
+		if (sd->next_in_service != NULL)
+			/*
+			 * The parent entity is still backlogged and
+			 * the budgets on the path towards the root
+			 * need to be updated.
+			 */
+			goto update;
+
+		/*
+		 * If we reach there the parent is no more backlogged and
+		 * we want to propagate the dequeue upwards.
+		 */
+		requeue = 1;
+	}
+
+	return;
+
+update:
+	entity = parent;
+	for_each_entity(entity) {
+		__bfq_activate_entity(entity);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd))
+			break;
+	}
+}
+
+/**
+ * bfq_update_vtime - update vtime if necessary.
+ * @st: the service tree to act upon.
+ *
+ * If necessary update the service tree vtime to have at least one
+ * eligible entity, skipping to its start time.  Assumes that the
+ * active tree of the device is not empty.
+ *
+ * NOTE: this hierarchical implementation updates vtimes quite often,
+ * we may end up with reactivated processes getting timestamps after a
+ * vtime skip done because we needed a ->first_active entity on some
+ * intermediate node.
+ */
+static void bfq_update_vtime(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry;
+	struct rb_node *node = st->active.rb_node;
+
+	entry = rb_entry(node, struct bfq_entity, rb_node);
+	if (bfq_gt(entry->min_start, st->vtime)) {
+		st->vtime = entry->min_start;
+		bfq_forget_idle(st);
+	}
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry, *first = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node != NULL) {
+		entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+		if (!bfq_gt(entry->start, st->vtime))
+			first = entry;
+
+		BUG_ON(bfq_gt(entry->min_start, st->vtime));
+
+		if (node->rb_left != NULL) {
+			entry = rb_entry(node->rb_left,
+					 struct bfq_entity, rb_node);
+			if (!bfq_gt(entry->min_start, st->vtime)) {
+				node = node->rb_left;
+				goto left;
+			}
+		}
+		if (first != NULL)
+			break;
+		node = node->rb_right;
+	}
+
+	BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
+	return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
+ */
+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
+						   bool force)
+{
+	struct bfq_entity *entity, *new_next_in_service = NULL;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	bfq_update_vtime(st);
+	entity = bfq_first_active_entity(st);
+	BUG_ON(bfq_gt(entity->start, st->vtime));
+
+	/*
+	 * If the chosen entity does not match with the sched_data's
+	 * next_in_service and we are forcedly serving the IDLE priority
+	 * class tree, bubble up budget update.
+	 */
+	if (unlikely(force && entity != entity->sched_data->next_in_service)) {
+		new_next_in_service = entity;
+		for_each_entity(new_next_in_service)
+			bfq_update_budget(new_next_in_service);
+	}
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ * @extract: if true the returned entity will be also extracted from @sd.
+ *
+ * NOTE: since we cache the next_in_service entity at each level of the
+ * hierarchy, the complexity of the lookup can be decreased with
+ * absolutely no effort just returning the cached next_in_service value;
+ * we prefer to do full lookups to test the consistency of * the data
+ * structures.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd)
+{
+	struct bfq_service_tree *st = sd->service_tree;
+	struct bfq_entity *entity;
+	int i = 0;
+
+	BUG_ON(sd->in_service_entity != NULL);
+
+	if (bfqd != NULL &&
+	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
+		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
+						  true);
+		if (entity != NULL) {
+			i = BFQ_IOPRIO_CLASSES - 1;
+			bfqd->bfq_class_idle_last_service = jiffies;
+			sd->next_in_service = entity;
+		}
+	}
+	for (; i < BFQ_IOPRIO_CLASSES; i++) {
+		entity = __bfq_lookup_next_entity(st + i, false);
+		if (entity != NULL) {
+			if (extract) {
+				bfq_check_next_in_service(sd, entity);
+				bfq_active_extract(st + i, entity);
+				sd->in_service_entity = entity;
+				sd->next_in_service = NULL;
+			}
+			break;
+		}
+	}
+
+	return entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+	struct bfq_entity *entity = NULL;
+	struct bfq_sched_data *sd;
+	struct bfq_queue *bfqq;
+
+	BUG_ON(bfqd->in_service_queue != NULL);
+
+	if (bfqd->busy_queues == 0)
+		return NULL;
+
+	sd = &bfqd->root_group->sched_data;
+	for (; sd != NULL; sd = entity->my_sched_data) {
+		entity = bfq_lookup_next_entity(sd, 1, bfqd);
+		BUG_ON(entity == NULL);
+		entity->service = 0;
+	}
+
+	bfqq = bfq_entity_to_bfqq(entity);
+	BUG_ON(bfqq == NULL);
+
+	return bfqq;
+}
+
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	if (bfqd->in_service_bic != NULL) {
+		put_io_context(bfqd->in_service_bic->icq.ioc);
+		bfqd->in_service_bic = NULL;
+	}
+
+	bfqd->in_service_queue = NULL;
+	del_timer(&bfqd->idle_slice_timer);
+}
+
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				int requeue)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfqq == bfqd->in_service_queue)
+		__bfq_bfqd_reset_in_service(bfqd);
+
+	bfq_deactivate_entity(entity, requeue);
+}
+
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_entity(entity);
+}
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree.
+ */
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			      int requeue)
+{
+	BUG_ON(!bfq_bfqq_busy(bfqq));
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+
+	BUG_ON(bfqd->busy_queues == 0);
+	bfqd->busy_queues--;
+
+	if (!bfqq->dispatched) {
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			BUG_ON(!bfqd->busy_in_flight_queues);
+			bfqd->busy_in_flight_queues--;
+			if (bfq_bfqq_constantly_seeky(bfqq)) {
+				BUG_ON(!bfqd->
+					const_seeky_busy_in_flight_queues);
+				bfqd->const_seeky_busy_in_flight_queues--;
+			}
+		}
+	}
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues--;
+
+	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqq == bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	bfqd->busy_queues++;
+
+	if (!bfqq->dispatched) {
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+		if (!blk_queue_nonrot(bfqd->queue)) {
+			bfqd->busy_in_flight_queues++;
+			if (bfq_bfqq_constantly_seeky(bfqq))
+				bfqd->const_seeky_busy_in_flight_queues++;
+		}
+	}
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues++;
+}
diff --git a/block/bfq.h b/block/bfq.h
new file mode 100644
index 000000000..e6f1a88f4
--- /dev/null
+++ b/block/bfq.h
@@ -0,0 +1,806 @@
+/*
+ * BFQ-v7r8 for 3.4.0: data structures and common functions prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/ioprio.h>
+#include <linux/rbtree.h>
+
+#define BFQ_IOPRIO_CLASSES	3
+#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
+
+#define BFQ_MIN_WEIGHT	1
+#define BFQ_MAX_WEIGHT	1000
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO	4
+
+#define BFQ_DEFAULT_GRP_WEIGHT	10
+#define BFQ_DEFAULT_GRP_IOPRIO	0
+#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ * @active: tree for active entities (i.e., those backlogged).
+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
+ * @first_idle: idle entity with minimum F_i.
+ * @last_idle: idle entity with maximum F_i.
+ * @vtime: scheduler virtual time.
+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+	struct rb_root active;
+	struct rb_root idle;
+
+	struct bfq_entity *first_idle;
+	struct bfq_entity *last_idle;
+
+	u64 vtime;
+	unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ * @in_service_entity: entity in service.
+ * @next_in_service: head-of-the-line entity in the scheduler.
+ * @service_tree: array of service trees, one per ioprio_class.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as
+ * an intermediate queue on a hierarchical setup.
+ * @next_in_service points to the active entity of the sched_data
+ * service trees that will be scheduled next.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+	struct bfq_entity *in_service_entity;
+	struct bfq_entity *next_in_service;
+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ * @weight: weight of the entities that this counter refers to.
+ * @num_active: number of active entities with this weight.
+ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
+ *                and @group_weights_tree).
+ */
+struct bfq_weight_counter {
+	short int weight;
+	unsigned int num_active;
+	struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ * @rb_node: service_tree member.
+ * @weight_counter: pointer to the weight counter associated with this entity.
+ * @on_st: flag, true if the entity is on a tree (either the active or
+ *         the idle one of its service_tree).
+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
+ * @start: B-WF2Q+ start timestamp (aka S_i).
+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
+ * @min_start: minimum start time of the (active) subtree rooted at
+ *             this entity; used for O(log N) lookups into active trees.
+ * @service: service received during the last round of service.
+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
+ * @weight: weight of the queue
+ * @parent: parent entity, for hierarchical scheduling.
+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
+ *                 associated scheduler queue, %NULL on leaf nodes.
+ * @sched_data: the scheduler queue this entity belongs to.
+ * @ioprio: the ioprio in use.
+ * @new_weight: when a weight change is requested, the new weight value.
+ * @orig_weight: original weight, used to implement weight boosting
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value.
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ *                    ioprio_class value.
+ * @ioprio_changed: flag, true when the user requested a weight, ioprio or
+ *                  ioprio_class change.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now.  Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @ioprio_changed flag.  As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+	struct rb_node rb_node;
+	struct bfq_weight_counter *weight_counter;
+
+	int on_st;
+
+	u64 finish;
+	u64 start;
+
+	struct rb_root *tree;
+
+	u64 min_start;
+
+	unsigned long service, budget;
+	unsigned short weight, new_weight;
+	unsigned short orig_weight;
+
+	struct bfq_entity *parent;
+
+	struct bfq_sched_data *my_sched_data;
+	struct bfq_sched_data *sched_data;
+
+	unsigned short ioprio, new_ioprio;
+	unsigned short ioprio_class, new_ioprio_class;
+
+	int ioprio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ * @ref: reference counter.
+ * @bfqd: parent bfq_data.
+ * @new_bfqq: shared bfq_queue if queue is cooperating with
+ *           one or more other queues.
+ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree).
+ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree).
+ * @sort_list: sorted list of pending requests.
+ * @next_rq: if fifo isn't expired, next request to serve.
+ * @queued: nr of requests queued in @sort_list.
+ * @allocated: currently allocated requests.
+ * @meta_pending: pending metadata requests.
+ * @fifo: fifo list of requests in sort_list.
+ * @entity: entity representing this queue in the scheduler.
+ * @max_budget: maximum budget allowed from the feedback mechanism.
+ * @budget_timeout: budget expiration (in jiffies).
+ * @dispatched: number of requests on the dispatch list or inside driver.
+ * @flags: status flags.
+ * @bfqq_list: node for active/idle bfqq list inside our bfqd.
+ * @burst_list_node: node for the device's burst list.
+ * @seek_samples: number of seeks sampled
+ * @seek_total: sum of the distances of the seeks sampled
+ * @seek_mean: mean seek distance
+ * @last_request_pos: position of the last request enqueued
+ * @requests_within_timer: number of consecutive pairs of request completion
+ *                         and arrival, such that the queue becomes idle
+ *                         after the completion, but the next request arrives
+ *                         within an idle time slice; used only if the queue's
+ *                         IO_bound has been cleared.
+ * @pid: pid of the process owning the queue, used for logging purposes.
+ * @last_wr_start_finish: start time of the current weight-raising period if
+ *                        the @bfq-queue is being weight-raised, otherwise
+ *                        finish time of the last weight-raising period
+ * @wr_cur_max_time: current max raising time for this queue
+ * @soft_rt_next_start: minimum time instant such that, only if a new
+ *                      request is enqueued after this time instant in an
+ *                      idle @bfq_queue with no outstanding requests, then
+ *                      the task associated with the queue it is deemed as
+ *                      soft real-time (see the comments to the function
+ *                      bfq_bfqq_softrt_next_start())
+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from
+ *                      idle to backlogged
+ * @service_from_backlogged: cumulative service received from the @bfq_queue
+ *                           since the last transition from idle to
+ *                           backlogged
+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
+ *	 queue is shared
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it  is  async or shared  between  cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+	atomic_t ref;
+	struct bfq_data *bfqd;
+
+	/* fields for cooperating queues handling */
+	struct bfq_queue *new_bfqq;
+	struct rb_node pos_node;
+	struct rb_root *pos_root;
+
+	struct rb_root sort_list;
+	struct request *next_rq;
+	int queued[2];
+	int allocated[2];
+	int meta_pending;
+	struct list_head fifo;
+
+	struct bfq_entity entity;
+
+	unsigned long max_budget;
+	unsigned long budget_timeout;
+
+	int dispatched;
+
+	unsigned int flags;
+
+	struct list_head bfqq_list;
+
+	struct hlist_node burst_list_node;
+
+	unsigned int seek_samples;
+	u64 seek_total;
+	sector_t seek_mean;
+	sector_t last_request_pos;
+
+	unsigned int requests_within_timer;
+
+	pid_t pid;
+	struct bfq_io_cq *bic;
+
+	/* weight-raising fields */
+	unsigned long wr_cur_max_time;
+	unsigned long soft_rt_next_start;
+	unsigned long last_wr_start_finish;
+	unsigned int wr_coeff;
+	unsigned long last_idle_bklogged;
+	unsigned long service_from_backlogged;
+};
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ * @ttime_total: total process thinktime
+ * @ttime_samples: number of thinktime samples
+ * @ttime_mean: average process thinktime
+ */
+struct bfq_ttime {
+	unsigned long last_end_request;
+
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ * @icq: associated io_cq structure
+ * @bfqq: array of two process queues, the sync and the async
+ * @ttime: associated @bfq_ttime struct
+ * @wr_time_left: snapshot of the time left before weight raising ends
+ *                for the sync queue associated to this process; this
+ *		  snapshot is taken to remember this value while the weight
+ *		  raising is suspended because the queue is merged with a
+ *		  shared queue, and is used to set @raising_cur_max_time
+ *		  when the queue is split from the shared queue and its
+ *		  weight is raised again
+ * @saved_idle_window: same purpose as the previous field for the idle
+ *                     window
+ * @saved_IO_bound: same purpose as the previous two fields for the I/O
+ *                  bound classification of a queue
+ * @saved_in_large_burst: same purpose as the previous fields for the
+ *                        value of the field keeping the queue's belonging
+ *                        to a large burst
+ * @was_in_burst_list: true if the queue belonged to a burst list
+ *                     before its merge with another cooperating queue
+ * @cooperations: counter of consecutive successful queue merges underwent
+ *                by any of the process' @bfq_queues
+ * @failed_cooperations: counter of consecutive failed queue merges of any
+ *                       of the process' @bfq_queues
+ */
+struct bfq_io_cq {
+	struct io_cq icq; /* must be the first member */
+	struct bfq_queue *bfqq[2];
+	struct bfq_ttime ttime;
+	int ioprio;
+
+	unsigned int wr_time_left;
+	bool saved_idle_window;
+	bool saved_IO_bound;
+
+	bool saved_in_large_burst;
+	bool was_in_burst_list;
+
+	unsigned int cooperations;
+	unsigned int failed_cooperations;
+};
+
+enum bfq_device_speed {
+	BFQ_BFQD_FAST,
+	BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per device data structure.
+ * @queue: request queue for the managed device.
+ * @root_group: root bfq_group for the device.
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_close_cooperator()).
+ * @active_numerous_groups: number of bfq_groups containing more than one
+ *                          active @bfq_entity.
+ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
+ *                      weight. Used to keep track of whether all @bfq_queues
+ *                     have the same weight. The tree contains one counter
+ *                     for each distinct weight associated to some active
+ *                     and not weight-raised @bfq_queue (see the comments to
+ *                      the functions bfq_weights_tree_[add|remove] for
+ *                     further details).
+ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
+ *                      by weight. Used to keep track of whether all
+ *                     @bfq_groups have the same weight. The tree contains
+ *                     one counter for each distinct weight associated to
+ *                     some active @bfq_group (see the comments to the
+ *                     functions bfq_weights_tree_[add|remove] for further
+ *                     details).
+ * @busy_queues: number of bfq_queues containing requests (including the
+ *		 queue in service, even if it is idling).
+ * @busy_in_flight_queues: number of @bfq_queues containing pending or
+ *                         in-flight requests, plus the @bfq_queue in
+ *                         service, even if idle but waiting for the
+ *                         possible arrival of its next sync request. This
+ *                         field is updated only if the device is rotational,
+ *                         but used only if the device is also NCQ-capable.
+ *                         The reason why the field is updated also for non-
+ *                         NCQ-capable rotational devices is related to the
+ *                         fact that the value of @hw_tag may be set also
+ *                         later than when busy_in_flight_queues may need to
+ *                         be incremented for the first time(s). Taking also
+ *                         this possibility into account, to avoid unbalanced
+ *                         increments/decrements, would imply more overhead
+ *                         than just updating busy_in_flight_queues
+ *                         regardless of the value of @hw_tag.
+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
+ *                                     (that is, seeky queues that expired
+ *                                     for budget timeout at least once)
+ *                                     containing pending or in-flight
+ *                                     requests, including the in-service
+ *                                     @bfq_queue if constantly seeky. This
+ *                                     field is updated only if the device
+ *                                     is rotational, but used only if the
+ *                                     device is also NCQ-capable (see the
+ *                                     comments to @busy_in_flight_queues).
+ * @wr_busy_queues: number of weight-raised busy @bfq_queues.
+ * @queued: number of queued requests.
+ * @rq_in_driver: number of requests dispatched and waiting for completion.
+ * @sync_flight: number of sync requests in the driver.
+ * @max_rq_in_driver: max number of reqs in driver in the last
+ *                    @hw_tag_samples completed requests.
+ * @hw_tag_samples: nr of samples used to calculate hw_tag.
+ * @hw_tag: flag set to one if the driver is showing a queueing behavior.
+ * @budgets_assigned: number of budgets assigned.
+ * @idle_slice_timer: timer set when idling for the next sequential request
+ *                    from the queue in service.
+ * @unplug_work: delayed work to restart dispatching on the request queue.
+ * @in_service_queue: bfq_queue in service.
+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
+ * @last_position: on-disk position of the last served request.
+ * @last_budget_start: beginning of the last budget.
+ * @last_idling_start: beginning of the last idle slice.
+ * @peak_rate: peak transfer rate observed for a budget.
+ * @peak_rate_samples: number of samples used to calculate @peak_rate.
+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before
+ *                  rescheduling.
+ * @group_list: list of all the bfq_groups active on the device.
+ * @active_list: list of all the bfq_queues active on the device.
+ * @idle_list: list of all the bfq_queues idle on the device.
+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires
+ *                   requests are served in fifo order.
+ * @bfq_back_penalty: weight of backward seeks wrt forward ones.
+ * @bfq_back_max: maximum allowed backward seek.
+ * @bfq_slice_idle: maximum idling time.
+ * @bfq_user_max_budget: user-configured max budget value
+ *                       (0 for auto-tuning).
+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
+ *                           async queues.
+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
+ *               to prevent seeky queues to impose long latencies to well
+ *               behaved ones (this also implies that seeky queues cannot
+ *               receive guarantees in the service domain; after a timeout
+ *               they are charged for the whole allocated budget, to try
+ *               to preserve a behavior reasonably fair among them, but
+ *               without service-domain guarantees).
+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
+ *                   no more granted any weight-raising.
+ * @bfq_failed_cooperations: number of consecutive failed cooperation
+ *                           chances after which weight-raising is restored
+ *                           to a queue subject to more than bfq_coop_thresh
+ *                           queue merges.
+ * @bfq_requests_within_timer: number of consecutive requests that must be
+ *                             issued within the idle time slice to set
+ *                             again idling to a queue which was marked as
+ *                             non-I/O-bound (see the definition of the
+ *                             IO_bound flag for further details).
+ * @last_ins_in_burst: last time at which a queue entered the current
+ *                     burst of queues being activated shortly after
+ *                     each other; for more details about this and the
+ *                     following parameters related to a burst of
+ *                     activations, see the comments to the function
+ *                     @bfq_handle_burst.
+ * @bfq_burst_interval: reference time interval used to decide whether a
+ *                      queue has been activated shortly after
+ *                      @last_ins_in_burst.
+ * @burst_size: number of queues in the current burst of queue activations.
+ * @bfq_large_burst_thresh: maximum burst size above which the current
+ * 			    queue-activation burst is deemed as 'large'.
+ * @large_burst: true if a large queue-activation burst is in progress.
+ * @burst_list: head of the burst list (as for the above fields, more details
+ * 		in the comments to the function bfq_handle_burst).
+ * @low_latency: if set to true, low-latency heuristics are enabled.
+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
+ *                queue is multiplied.
+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
+ *			  may be reactivated for a queue (in jiffies).
+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
+ *				after which weight-raising may be
+ *				reactivated for an already busy queue
+ *				(in jiffies).
+ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
+ *			    sectors per seconds.
+ * @RT_prod: cached value of the product R*T used for computing the maximum
+ *	     duration of the weight raising automatically.
+ * @device_speed: device-speed class for the low-latency heuristic.
+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
+ *
+ * All the fields are protected by the @queue lock.
+ */
+struct bfq_data {
+	struct request_queue *queue;
+
+	struct bfq_group *root_group;
+	struct rb_root rq_pos_tree;
+
+#ifdef CONFIG_CGROUP_BFQIO
+	int active_numerous_groups;
+#endif
+
+	struct rb_root queue_weights_tree;
+	struct rb_root group_weights_tree;
+
+	int busy_queues;
+	int busy_in_flight_queues;
+	int const_seeky_busy_in_flight_queues;
+	int wr_busy_queues;
+	int queued;
+	int rq_in_driver;
+	int sync_flight;
+
+	int max_rq_in_driver;
+	int hw_tag_samples;
+	int hw_tag;
+
+	int budgets_assigned;
+
+	struct timer_list idle_slice_timer;
+	struct work_struct unplug_work;
+
+	struct bfq_queue *in_service_queue;
+	struct bfq_io_cq *in_service_bic;
+
+	sector_t last_position;
+
+	ktime_t last_budget_start;
+	ktime_t last_idling_start;
+	int peak_rate_samples;
+	u64 peak_rate;
+	unsigned long bfq_max_budget;
+
+	struct hlist_head group_list;
+	struct list_head active_list;
+	struct list_head idle_list;
+
+	unsigned int bfq_fifo_expire[2];
+	unsigned int bfq_back_penalty;
+	unsigned int bfq_back_max;
+	unsigned int bfq_slice_idle;
+	u64 bfq_class_idle_last_service;
+
+	unsigned int bfq_user_max_budget;
+	unsigned int bfq_max_budget_async_rq;
+	unsigned int bfq_timeout[2];
+
+	unsigned int bfq_coop_thresh;
+	unsigned int bfq_failed_cooperations;
+	unsigned int bfq_requests_within_timer;
+
+	unsigned long last_ins_in_burst;
+	unsigned long bfq_burst_interval;
+	int burst_size;
+	unsigned long bfq_large_burst_thresh;
+	bool large_burst;
+	struct hlist_head burst_list;
+
+	bool low_latency;
+
+	/* parameters of the low_latency heuristics */
+	unsigned int bfq_wr_coeff;
+	unsigned int bfq_wr_max_time;
+	unsigned int bfq_wr_rt_max_time;
+	unsigned int bfq_wr_min_idle_time;
+	unsigned long bfq_wr_min_inter_arr_async;
+	unsigned int bfq_wr_max_softrt_rate;
+	u64 RT_prod;
+	enum bfq_device_speed device_speed;
+
+	struct bfq_queue oom_bfqq;
+};
+
+enum bfqq_state_flags {
+	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */
+	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
+	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
+	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
+	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */
+	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
+	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */
+	BFQ_BFQQ_FLAG_IO_bound,		/*
+					 * bfqq has timed-out at least once
+					 * having consumed at most 2/10 of
+					 * its budget
+					 */
+	BFQ_BFQQ_FLAG_in_large_burst,	/*
+					 * bfqq activated in a large burst,
+					 * see comments to bfq_handle_burst.
+					 */
+	BFQ_BFQQ_FLAG_constantly_seeky,	/*
+					 * bfqq has proved to be slow and
+					 * seeky until budget timeout
+					 */
+	BFQ_BFQQ_FLAG_softrt_update,	/*
+					 * may need softrt-next-start
+					 * update
+					 */
+	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */
+	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
+	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */
+};
+
+#define BFQ_BFQQ_FNS(name)						\
+static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
+{									\
+	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)	\
+{									\
+	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
+{									\
+	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
+}
+
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(must_alloc);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(budget_new);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(constantly_seeky);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(just_split);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Logging facilities. */
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
+
+#define bfq_log(bfqd, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+	BFQ_BFQQ_TOO_IDLE = 0,		/*
+					 * queue has been idling for
+					 * too long
+					 */
+	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */
+	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */
+	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
+};
+
+#ifdef CONFIG_CGROUP_BFQIO
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @group_node: node to be inserted into the bfqio_cgroup->group_data
+ *              list of the containing cgroup's bfqio_cgroup.
+ * @bfqd_node: node to be inserted into the @bfqd->group_list list
+ *             of the groups active on the same device; used for cleanup.
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_must_not_expire()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @group_node is protected by the bfqio_cgroup lock, and is accessed
+ *      via RCU from its readers.
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+	struct bfq_entity entity;
+	struct bfq_sched_data sched_data;
+
+	struct hlist_node group_node;
+	struct hlist_node bfqd_node;
+
+	void *bfqd;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct bfq_entity *my_entity;
+
+	int active_entities;
+};
+
+/**
+ * struct bfqio_cgroup - bfq cgroup data structure.
+ * @css: subsystem state for bfq in the containing cgroup.
+ * @weight: cgroup weight.
+ * @ioprio: cgroup ioprio.
+ * @ioprio_class: cgroup ioprio_class.
+ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data.
+ * @group_data: list containing the bfq_group belonging to this cgroup.
+ *
+ * @group_data is accessed using RCU, with @lock protecting the updates,
+ * @ioprio and @ioprio_class are protected by @lock.
+ */
+struct bfqio_cgroup {
+	struct cgroup_subsys_state css;
+
+	unsigned short weight, ioprio, ioprio_class;
+
+	spinlock_t lock;
+	struct hlist_head group_data;
+};
+#else
+struct bfq_group {
+	struct bfq_sched_data sched_data;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+};
+#endif
+
+static inline struct bfq_service_tree *
+bfq_entity_service_tree(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sched_data = entity->sched_data;
+	unsigned int idx = entity->ioprio_class - 1;
+
+	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
+	BUG_ON(sched_data == NULL);
+
+	return sched_data->service_tree + idx;
+}
+
+static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic,
+					    bool is_sync)
+{
+	return bic->bfqq[is_sync];
+}
+
+static inline void bic_set_bfqq(struct bfq_io_cq *bic,
+				struct bfq_queue *bfqq, bool is_sync)
+{
+	bic->bfqq[is_sync] = bfqq;
+}
+
+static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+	return bic->icq.q->elevator->elevator_data;
+}
+
+/**
+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
+ * @ptr: a pointer to a bfqd.
+ * @flags: storage for the flags to be saved.
+ *
+ * This function allows bfqg->bfqd to be protected by the
+ * queue lock of the bfqd they reference; the pointer is dereferenced
+ * under RCU, so the storage for bfqd is assured to be safe as long
+ * as the RCU read side critical section does not end.  After the
+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
+ * sure that no other writer accessed it.  If we raced with a writer,
+ * the function returns NULL, with the queue unlocked, otherwise it
+ * returns the dereferenced pointer, with the queue locked.
+ */
+static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr,
+						   unsigned long *flags)
+{
+	struct bfq_data *bfqd;
+
+	rcu_read_lock();
+	bfqd = rcu_dereference(*(struct bfq_data **)ptr);
+
+	if (bfqd != NULL) {
+		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
+		if (*ptr == bfqd)
+			goto out;
+		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+	}
+
+	bfqd = NULL;
+out:
+	rcu_read_unlock();
+	return bfqd;
+}
+
+static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd,
+				       unsigned long *flags)
+{
+	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
+}
+
+static void bfq_check_ioprio_change(struct io_context *ioc,
+				    struct bfq_io_cq *bic);
+static void bfq_put_queue(struct bfq_queue *bfqq);
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bfq_group *bfqg, int is_sync,
+				       struct io_context *ioc, gfp_t gfp_mask);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg);
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+#endif /* _BFQ_H */
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 1b5cb66ef..55dd9861d 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/sched/sysctl.h>
 
 #include "blk.h"
 
diff --git a/build.sh b/build.sh
index d6c7de091..0a6060e70 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ RDIR=$(pwd)
 # japanese variants:
 #	dcm = N900D / SC-01F  (NTT Docomo)
 #	kdi = N900J / SCL22   (au by KDDI)
-VARIANT=can
+VARIANT=eur
 
 [ -z $VER ] && \
 # version number
@@ -61,11 +61,13 @@ MAKE_ZIP=1
 MAKE_TAR=1
 
 # directory containing cross-compile arm-cortex_a15 toolchain
-TOOLCHAIN=/home/jc/build/toolchain/arm-cortex_a15-linux-gnueabihf-linaro_4.9.4-2015.06
+TOOLCHAIN=/home/vagrant/g2/DORIMANX_LG_STOCK_LP_KERNEL/android-toolchain
 
 # amount of cpu threads to use in kernel make process
 THREADS=5
 
+G_SESNSOR_HUB=1
+
 ############## SCARY NO-TOUCHY STUFF ###############
 
 export ARCH=arm
@@ -82,6 +84,10 @@ if ! [ -d $RDIR"/ik.ramdisk/variant/$VARIANT/" ] ; then
 	exit -1
 fi
 
+if [ $G_SESNSOR_HUB -eq 1 ] ; then
+	sed -i "s/CONFIG_SENSORS_SSP_STM_LEGACY=y/# CONFIG_SENSORS_SSP_STM_LEGACY is not set/" arch/arm/configs/ik_defconfig
+fi
+
 [ $PERMISSIVE -eq 1 ] && SELINUX="never_enforce" || SELINUX="always_enforce"
 
 KDIR=$RDIR/build/arch/arm/boot
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 566ebe0d8..9a6bd9712 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1035,6 +1035,22 @@ config CRYPTO_LZO
 	help
 	  This is the LZO algorithm.
 
+config CRYPTO_LZ4
+	tristate "LZ4 compression algorithm"
+	select CRYPTO_ALGAPI
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	help
+	  This is the LZ4 algorithm.
+
+config CRYPTO_LZ4HC
+	tristate "LZ4HC compression algorithm"
+	select CRYPTO_ALGAPI
+	select LZ4HC_COMPRESS
+	select LZ4_DECOMPRESS
+	help
+	  This is the LZ4 high compression mode algorithm.
+
 comment "Random Number Generation"
 
 config CRYPTO_ANSI_CPRNG
diff --git a/crypto/Makefile b/crypto/Makefile
index cba53f235..df6e37262 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
 obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o
 obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
 obj-$(CONFIG_CRYPTO_LZO) += lzo.o
+obj-$(CONFIG_CRYPTO_LZ4) += lz4.o
+obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o
 obj-$(CONFIG_CRYPTO_RNG2) += rng.o
 obj-$(CONFIG_CRYPTO_RNG2) += krng.o
 obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o
diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c
index 313a05b8f..7da6f6a21 100644
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -715,7 +715,7 @@ struct crypto_ablkcipher *crypto_alloc_ablkcipher(const char *alg_name,
 err:
 		if (err != -EAGAIN)
 			break;
-		if (signal_pending(current)) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}
diff --git a/crypto/ahash.c b/crypto/ahash.c
index d4c8aae5b..2a79f7ff1 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -487,7 +487,8 @@ static int ahash_prepare_alg(struct ahash_alg *alg)
 	struct crypto_alg *base = &alg->halg.base;
 
 	if (alg->halg.digestsize > PAGE_SIZE / 8 ||
-	    alg->halg.statesize > PAGE_SIZE / 8)
+	    alg->halg.statesize > PAGE_SIZE / 8 ||
+	    alg->halg.statesize == 0)
 		return -EINVAL;
 
 	base->cra_type = &crypto_ahash_type;
diff --git a/crypto/algapi.c b/crypto/algapi.c
index c18f8b104..8df2104b5 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -386,7 +386,7 @@ static void crypto_wait_for_test(struct crypto_larval *larval)
 		crypto_alg_tested(larval->alg.cra_driver_name, 0);
 	}
 
-	err = wait_for_completion_interruptible(&larval->completion);
+	err = wait_for_completion_killable(&larval->completion);
 	WARN_ON(err);
 
 out:
diff --git a/crypto/api.c b/crypto/api.c
index 5c13704f4..29fe0757f 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -181,7 +181,7 @@ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg)
 	struct crypto_larval *larval = (void *)alg;
 	long timeout;
 
-	timeout = wait_for_completion_interruptible_timeout(
+	timeout = wait_for_completion_killable_timeout(
 		&larval->completion, 60 * HZ);
 
 	alg = larval->adult;
@@ -460,7 +460,7 @@ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask)
 err:
 		if (err != -EAGAIN)
 			break;
-		if (signal_pending(current)) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}
@@ -589,7 +589,7 @@ void *crypto_alloc_tfm(const char *alg_name,
 err:
 		if (err != -EAGAIN)
 			break;
-		if (signal_pending(current)) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}
diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
index 910497bd7..0c19d0357 100644
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -350,7 +350,7 @@ static struct crypto_alg *crypto_user_aead_alg(const char *name, u32 type,
 		err = PTR_ERR(alg);
 		if (err != -EAGAIN)
 			break;
-		if (signal_pending(current)) {
+		if (fatal_signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}
diff --git a/crypto/lz4.c b/crypto/lz4.c
new file mode 100644
index 000000000..34d072b72
--- /dev/null
+++ b/crypto/lz4.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4_ctx {
+	void *lz4_comp_mem;
+};
+
+static int lz4_init(struct crypto_tfm *tfm)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS);
+	if (!ctx->lz4_comp_mem)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void lz4_exit(struct crypto_tfm *tfm)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+	vfree(ctx->lz4_comp_mem);
+}
+
+static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	struct lz4_ctx *ctx = crypto_tfm_ctx(tfm);
+	size_t tmp_len = *dlen;
+	int err;
+
+	err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem);
+
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	int err;
+	size_t tmp_len = *dlen;
+	size_t __slen = slen;
+
+	err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len);
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return err;
+}
+
+static struct crypto_alg alg_lz4 = {
+	.cra_name		= "lz4",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct lz4_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg_lz4.cra_list),
+	.cra_init		= lz4_init,
+	.cra_exit		= lz4_exit,
+	.cra_u			= { .compress = {
+	.coa_compress		= lz4_compress_crypto,
+	.coa_decompress		= lz4_decompress_crypto } }
+};
+
+static int __init lz4_mod_init(void)
+{
+	return crypto_register_alg(&alg_lz4);
+}
+
+static void __exit lz4_mod_fini(void)
+{
+	crypto_unregister_alg(&alg_lz4);
+}
+
+module_init(lz4_mod_init);
+module_exit(lz4_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4 Compression Algorithm");
diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c
new file mode 100644
index 000000000..9218b3fed
--- /dev/null
+++ b/crypto/lz4hc.c
@@ -0,0 +1,106 @@
+/*
+ * Cryptographic API.
+ *
+ * Copyright (c) 2013 Chanho Min <chanho.min@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/vmalloc.h>
+#include <linux/lz4.h>
+
+struct lz4hc_ctx {
+	void *lz4hc_comp_mem;
+};
+
+static int lz4hc_init(struct crypto_tfm *tfm)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS);
+	if (!ctx->lz4hc_comp_mem)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void lz4hc_exit(struct crypto_tfm *tfm)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	vfree(ctx->lz4hc_comp_mem);
+}
+
+static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			    unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm);
+	size_t tmp_len = *dlen;
+	int err;
+
+	err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem);
+
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return 0;
+}
+
+static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src,
+			      unsigned int slen, u8 *dst, unsigned int *dlen)
+{
+	int err;
+	size_t tmp_len = *dlen;
+	size_t __slen = slen;
+
+	err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len);
+	if (err < 0)
+		return -EINVAL;
+
+	*dlen = tmp_len;
+	return err;
+}
+
+static struct crypto_alg alg_lz4hc = {
+	.cra_name		= "lz4hc",
+	.cra_flags		= CRYPTO_ALG_TYPE_COMPRESS,
+	.cra_ctxsize		= sizeof(struct lz4hc_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(alg_lz4hc.cra_list),
+	.cra_init		= lz4hc_init,
+	.cra_exit		= lz4hc_exit,
+	.cra_u			= { .compress = {
+	.coa_compress		= lz4hc_compress_crypto,
+	.coa_decompress		= lz4hc_decompress_crypto } }
+};
+
+static int __init lz4hc_mod_init(void)
+{
+	return crypto_register_alg(&alg_lz4hc);
+}
+
+static void __exit lz4hc_mod_fini(void)
+{
+	crypto_unregister_alg(&alg_lz4hc);
+}
+
+module_init(lz4hc_mod_init);
+module_exit(lz4hc_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("LZ4HC Compression Algorithm");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index b0b161d03..360db8701 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -462,7 +462,7 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 
 			ret = crypto_aead_setkey(tfm, key,
 						 template[i].klen);
-			if (!ret == template[i].fail) {
+			if ((!ret) == template[i].fail) {
 				printk(KERN_ERR "alg: aead: setkey failed on "
 				       "test %d for %s: flags=%x\n", j, algo,
 				       crypto_aead_get_flags(tfm));
@@ -552,7 +552,7 @@ static int test_aead(struct crypto_aead *tfm, int enc,
 			key = template[i].key;
 
 			ret = crypto_aead_setkey(tfm, key, template[i].klen);
-			if (!ret == template[i].fail) {
+			if ((!ret) == template[i].fail) {
 				printk(KERN_ERR "alg: aead: setkey failed on "
 				       "chunk test %d for %s: flags=%x\n", j,
 				       algo, crypto_aead_get_flags(tfm));
@@ -756,7 +756,7 @@ static int test_cipher(struct crypto_cipher *tfm, int enc,
 
 		ret = crypto_cipher_setkey(tfm, template[i].key,
 					   template[i].klen);
-		if (!ret == template[i].fail) {
+		if ((!ret) == template[i].fail) {
 			printk(KERN_ERR "alg: cipher: setkey failed "
 			       "on test %d for %s: flags=%x\n", j,
 			       algo, crypto_cipher_get_flags(tfm));
@@ -852,7 +852,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 
 			ret = crypto_ablkcipher_setkey(tfm, template[i].key,
 						       template[i].klen);
-			if (!ret == template[i].fail) {
+			if ((!ret) == template[i].fail) {
 				printk(KERN_ERR "alg: skcipher: setkey failed "
 				       "on test %d for %s: flags=%x\n", j,
 				       algo, crypto_ablkcipher_get_flags(tfm));
@@ -916,7 +916,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc,
 
 			ret = crypto_ablkcipher_setkey(tfm, template[i].key,
 						       template[i].klen);
-			if (!ret == template[i].fail) {
+			if ((!ret) == template[i].fail) {
 				printk(KERN_ERR "alg: skcipher: setkey failed "
 				       "on chunk test %d for %s: flags=%x\n",
 				       j, algo,
diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c
index 5b9385239..0d752851a 100644
--- a/drivers/auxdisplay/ks0108.c
+++ b/drivers/auxdisplay/ks0108.c
@@ -139,6 +139,7 @@ static int __init ks0108_init(void)
 
 	ks0108_pardevice = parport_register_device(ks0108_parport, KS0108_NAME,
 		NULL, NULL, NULL, PARPORT_DEV_EXCL, NULL);
+	parport_put_port(ks0108_parport);
 	if (ks0108_pardevice == NULL) {
 		printk(KERN_ERR KS0108_NAME ": ERROR: "
 			"parport didn't register new device\n");
diff --git a/drivers/base/devres.c b/drivers/base/devres.c
index 524bf96c2..06c541dc4 100644
--- a/drivers/base/devres.c
+++ b/drivers/base/devres.c
@@ -254,10 +254,10 @@ void * devres_get(struct device *dev, void *new_res,
 	if (!dr) {
 		add_dr(dev, &new_dr->node);
 		dr = new_dr;
-		new_dr = NULL;
+		new_res = NULL;
 	}
 	spin_unlock_irqrestore(&dev->devres_lock, flags);
-	devres_free(new_dr);
+	devres_free(new_res);
 
 	return dr->data;
 }
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index a1a722502..5a1373330 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -311,9 +311,7 @@ int platform_device_add(struct platform_device *pdev)
  failed:
 	while (--i >= 0) {
 		struct resource *r = &pdev->resource[i];
-		unsigned long type = resource_type(r);
-
-		if (type == IORESOURCE_MEM || type == IORESOURCE_IO)
+		if (r->parent)
 			release_resource(r);
 	}
 
@@ -338,9 +336,7 @@ void platform_device_del(struct platform_device *pdev)
 
 		for (i = 0; i < pdev->num_resources; i++) {
 			struct resource *r = &pdev->resource[i];
-			unsigned long type = resource_type(r);
-
-			if (type == IORESOURCE_MEM || type == IORESOURCE_IO)
+			if (r->parent)
 				release_resource(r);
 		}
 	}
diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c
index 1db128951..023a9d79e 100644
--- a/drivers/base/regmap/regmap-debugfs.c
+++ b/drivers/base/regmap/regmap-debugfs.c
@@ -23,8 +23,7 @@ static struct dentry *regmap_debugfs_root;
 /* Calculate the length of a fixed format  */
 static size_t regmap_calc_reg_len(int max_val, char *buf, size_t buf_size)
 {
-	snprintf(buf, buf_size, "%x", max_val);
-	return strlen(buf);
+	return snprintf(NULL, 0, "%x", max_val);
 }
 
 static ssize_t regmap_name_read_file(struct file *file,
@@ -205,7 +204,7 @@ static ssize_t regmap_access_read_file(struct file *file,
 		/* If we're in the region the user is trying to read */
 		if (p >= *ppos) {
 			/* ...but not beyond it */
-			if (buf_pos >= count - 1 - tot_len)
+			if (buf_pos + tot_len + 1 >= count)
 				break;
 
 			/* Format the register */
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a79640712..e93e2ded7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -118,6 +118,8 @@ source "drivers/block/paride/Kconfig"
 
 source "drivers/block/mtip32xx/Kconfig"
 
+source "drivers/block/zram/Kconfig"
+
 config BLK_CPQ_DA
 	tristate "Compaq SMART2 support"
 	depends on PCI && VIRT_TO_BUS
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index a81cdd7b9..16477b255 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1314,7 +1314,8 @@ static void blkback_changed(struct xenbus_device *dev,
 			break;
 		/* Missed the backend's Closing state -- fallthrough */
 	case XenbusStateClosing:
-		blkfront_closing(info);
+		if (info)
+			blkfront_closing(info);
 		break;
 	}
 }
diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig
similarity index 70%
rename from drivers/staging/zram/Kconfig
rename to drivers/block/zram/Kconfig
index 983314c41..386ba3d1a 100644
--- a/drivers/staging/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -14,12 +14,13 @@ config ZRAM
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
-	  Project home: <https://compcache.googlecode.com/>
 
-config ZRAM_DEBUG
-	bool "Compressed RAM block device debug support"
+config ZRAM_LZ4_COMPRESS
+	bool "Enable LZ4 algorithm support"
 	depends on ZRAM
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
 	default n
 	help
-	  This option adds additional debugging code to the compressed
-	  RAM block device driver.
+	  This option enables LZ4 compression algorithm support. Compression
+	  algorithm can be changed using `comp_algorithm' device attribute.
\ No newline at end of file
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
new file mode 100644
index 000000000..be0763ff5
--- /dev/null
+++ b/drivers/block/zram/Makefile
@@ -0,0 +1,5 @@
+zram-y	:=	zcomp_lzo.o zcomp.o zram_drv.o
+
+zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
+
+obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
new file mode 100644
index 000000000..b51a816d7
--- /dev/null
+++ b/drivers/block/zram/zcomp.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/cpu.h>
+
+#include "zcomp.h"
+#include "zcomp_lzo.h"
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+#include "zcomp_lz4.h"
+#endif
+
+static struct zcomp_backend *backends[] = {
+	&zcomp_lzo,
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+	&zcomp_lz4,
+#endif
+	NULL
+};
+
+static struct zcomp_backend *find_backend(const char *compress)
+{
+	int i = 0;
+	while (backends[i]) {
+		if (sysfs_streq(compress, backends[i]->name))
+			break;
+		i++;
+	}
+	return backends[i];
+}
+
+static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+	if (zstrm->private)
+		comp->backend->destroy(zstrm->private);
+	free_pages((unsigned long)zstrm->buffer, 1);
+	kfree(zstrm);
+}
+
+/*
+ * allocate new zcomp_strm structure with ->private initialized by
+ * backend, return NULL on error
+ */
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
+{
+	struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags);
+	if (!zstrm)
+		return NULL;
+
+	zstrm->private = comp->backend->create(flags);
+	/*
+	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
+	 * case when compressed size is larger than the original one
+	 */
+	zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1);
+	if (!zstrm->private || !zstrm->buffer) {
+		zcomp_strm_free(comp, zstrm);
+		zstrm = NULL;
+	}
+	return zstrm;
+}
+
+/* show available compressors */
+ssize_t zcomp_available_show(const char *comp, char *buf)
+{
+	ssize_t sz = 0;
+	int i = 0;
+
+	while (backends[i]) {
+		if (!strcmp(comp, backends[i]->name))
+			sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+					"[%s] ", backends[i]->name);
+		else
+			sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+					"%s ", backends[i]->name);
+		i++;
+	}
+	sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+	return sz;
+}
+
+bool zcomp_available_algorithm(const char *comp)
+{
+	return find_backend(comp) != NULL;
+}
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
+{
+	return *get_cpu_ptr(comp->stream);
+}
+
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+	put_cpu_ptr(comp->stream);
+}
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+		const unsigned char *src, size_t *dst_len)
+{
+	return comp->backend->compress(src, zstrm->buffer, dst_len,
+			zstrm->private);
+}
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+		size_t src_len, unsigned char *dst)
+{
+	return comp->backend->decompress(src, src_len, dst);
+}
+
+static int __zcomp_cpu_notifier(struct zcomp *comp,
+		unsigned long action, unsigned long cpu)
+{
+	struct zcomp_strm *zstrm;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
+			break;
+		zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
+		if (IS_ERR_OR_NULL(zstrm)) {
+			pr_err("Can't allocate a compression stream\n");
+			return NOTIFY_BAD;
+		}
+		*per_cpu_ptr(comp->stream, cpu) = zstrm;
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		zstrm = *per_cpu_ptr(comp->stream, cpu);
+		if (!IS_ERR_OR_NULL(zstrm))
+			zcomp_strm_free(comp, zstrm);
+		*per_cpu_ptr(comp->stream, cpu) = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int zcomp_cpu_notifier(struct notifier_block *nb,
+		unsigned long action, void *pcpu)
+{
+	unsigned long cpu = (unsigned long)pcpu;
+	struct zcomp *comp = container_of(nb, typeof(*comp), notifier);
+
+	return __zcomp_cpu_notifier(comp, action, cpu);
+}
+
+static int zcomp_init(struct zcomp *comp)
+{
+	unsigned long cpu;
+	int ret;
+
+	comp->notifier.notifier_call = zcomp_cpu_notifier;
+
+	comp->stream = alloc_percpu(struct zcomp_strm *);
+	if (!comp->stream)
+		return -ENOMEM;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu) {
+		ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu);
+		if (ret == NOTIFY_BAD)
+			goto cleanup;
+	}
+	__register_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+	return 0;
+
+cleanup:
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	cpu_notifier_register_done();
+	return -ENOMEM;
+}
+
+void zcomp_destroy(struct zcomp *comp)
+{
+	unsigned long cpu;
+
+	cpu_notifier_register_begin();
+	for_each_online_cpu(cpu)
+		__zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu);
+	__unregister_cpu_notifier(&comp->notifier);
+	cpu_notifier_register_done();
+
+	free_percpu(comp->stream);
+	kfree(comp);
+}
+
+/*
+ * search available compressors for requested algorithm.
+ * allocate new zcomp and initialize it. return compressing
+ * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
+ * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
+ * case of allocation error, or any other error potentially
+ * returned by zcomp_init().
+ */
+struct zcomp *zcomp_create(const char *compress)
+{
+	struct zcomp *comp;
+	struct zcomp_backend *backend;
+	int error;
+
+	backend = find_backend(compress);
+	if (!backend)
+		return ERR_PTR(-EINVAL);
+
+	comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
+	if (!comp)
+		return ERR_PTR(-ENOMEM);
+
+	comp->backend = backend;
+	error = zcomp_init(comp);
+	if (error) {
+		kfree(comp);
+		return ERR_PTR(error);
+	}
+	return comp;
+}
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
new file mode 100644
index 000000000..ffd88cb74
--- /dev/null
+++ b/drivers/block/zram/zcomp.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_H_
+#define _ZCOMP_H_
+
+struct zcomp_strm {
+	/* compression/decompression buffer */
+	void *buffer;
+	/*
+	 * The private data of the compression stream, only compression
+	 * stream backend can touch this (e.g. compression algorithm
+	 * working memory)
+	 */
+	void *private;
+};
+
+/* static compression backend */
+struct zcomp_backend {
+	int (*compress)(const unsigned char *src, unsigned char *dst,
+			size_t *dst_len, void *private);
+
+	int (*decompress)(const unsigned char *src, size_t src_len,
+			unsigned char *dst);
+
+	void *(*create)(gfp_t flags);
+	void (*destroy)(void *private);
+
+	const char *name;
+};
+
+/* dynamic per-device compression frontend */
+struct zcomp {
+	struct zcomp_strm * __percpu *stream;
+	struct zcomp_backend *backend;
+	struct notifier_block notifier;
+};
+
+ssize_t zcomp_available_show(const char *comp, char *buf);
+bool zcomp_available_algorithm(const char *comp);
+
+struct zcomp *zcomp_create(const char *comp);
+void zcomp_destroy(struct zcomp *comp);
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm);
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+		const unsigned char *src, size_t *dst_len);
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+		size_t src_len, unsigned char *dst);
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
+#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
new file mode 100644
index 000000000..0110086ac
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lz4.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include "zcomp_lz4.h"
+
+static void *zcomp_lz4_create(gfp_t flags)
+{
+	void *ret;
+
+	ret = kmalloc(LZ4_MEM_COMPRESS, flags);
+	if (!ret)
+		ret = __vmalloc(LZ4_MEM_COMPRESS,
+				flags | __GFP_HIGHMEM,
+				PAGE_KERNEL);
+	return ret;
+}
+
+static void zcomp_lz4_destroy(void *private)
+{
+	kvfree(private);
+}
+
+static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
+		size_t *dst_len, void *private)
+{
+	/* return  : Success if return 0 */
+	return lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
+}
+
+static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
+		unsigned char *dst)
+{
+	size_t dst_len = PAGE_SIZE;
+	/* return  : Success if return 0 */
+	return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
+}
+
+struct zcomp_backend zcomp_lz4 = {
+	.compress = zcomp_lz4_compress,
+	.decompress = zcomp_lz4_decompress,
+	.create = zcomp_lz4_create,
+	.destroy = zcomp_lz4_destroy,
+	.name = "lz4",
+};
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h
new file mode 100644
index 000000000..60613fb29
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZ4_H_
+#define _ZCOMP_LZ4_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lz4;
+
+#endif /* _ZCOMP_LZ4_H_ */
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
new file mode 100644
index 000000000..ed7a1f054
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lzo.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+#include "zcomp_lzo.h"
+
+static void *lzo_create(gfp_t flags)
+{
+	void *ret;
+
+	ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
+	if (!ret)
+		ret = __vmalloc(LZO1X_MEM_COMPRESS,
+				flags | __GFP_HIGHMEM,
+				PAGE_KERNEL);
+	return ret;
+}
+
+static void lzo_destroy(void *private)
+{
+	kvfree(private);
+}
+
+static int lzo_compress(const unsigned char *src, unsigned char *dst,
+		size_t *dst_len, void *private)
+{
+	int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
+	return ret == LZO_E_OK ? 0 : ret;
+}
+
+static int lzo_decompress(const unsigned char *src, size_t src_len,
+		unsigned char *dst)
+{
+	size_t dst_len = PAGE_SIZE;
+	int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
+	return ret == LZO_E_OK ? 0 : ret;
+}
+
+struct zcomp_backend zcomp_lzo = {
+	.compress = lzo_compress,
+	.decompress = lzo_decompress,
+	.create = lzo_create,
+	.destroy = lzo_destroy,
+	.name = "lzo",
+};
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h
new file mode 100644
index 000000000..128c5807f
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZO_H_
+#define _ZCOMP_LZO_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lzo;
+
+#endif /* _ZCOMP_LZO_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
new file mode 100644
index 000000000..a0b07110d
--- /dev/null
+++ b/drivers/block/zram/zram_drv.c
@@ -0,0 +1,1555 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#define KMSG_COMPONENT "zram"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/sysfs.h>
+
+#include "zram_drv.h"
+
+#ifdef CONFIG_STATE_NOTIFIER
+#include <linux/state_notifier.h>
+static struct notifier_block notif;
+#endif
+
+static DEFINE_IDR(zram_index_idr);
+/* idr index must be protected */
+static DEFINE_MUTEX(zram_index_mutex);
+
+static int zram_major;
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+static const char *default_compressor = "lz4";
+#else
+static const char *default_compressor = "lzo";
+#endif
+
+/* Module params (documentation at end) */
+static unsigned int num_devices = 1;
+
+static inline void deprecated_attr_warn(const char *name)
+{
+	pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
+			task_pid_nr(current),
+			current->comm,
+			name,
+			"See zram documentation.");
+}
+
+#define ZRAM_ATTR_RO(name)						\
+static ssize_t name##_show(struct device *d,				\
+				struct device_attribute *attr, char *b)	\
+{									\
+	struct zram *zram = dev_to_zram(d);				\
+									\
+	deprecated_attr_warn(__stringify(name));			\
+	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
+		(u64)atomic64_read(&zram->stats.name));			\
+}									\
+static DEVICE_ATTR_RO(name);
+
+static inline bool init_done(struct zram *zram)
+{
+	return zram->disksize;
+}
+
+static inline struct zram *dev_to_zram(struct device *dev)
+{
+	return (struct zram *)dev_to_disk(dev)->private_data;
+}
+
+/* flag operations require table entry bit_spin_lock() being held */
+static int zram_test_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	return meta->table[index].value & BIT(flag);
+}
+
+static void zram_set_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	meta->table[index].value |= BIT(flag);
+}
+
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
+			enum zram_pageflags flag)
+{
+	meta->table[index].value &= ~BIT(flag);
+}
+
+static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+{
+	return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+}
+
+static void zram_set_obj_size(struct zram_meta *meta,
+					u32 index, size_t size)
+{
+	unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+
+	meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+}
+
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+	return bvec->bv_len != PAGE_SIZE;
+}
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline bool valid_io_request(struct zram *zram,
+		sector_t start, unsigned int size)
+{
+	u64 end, bound;
+
+	/* unaligned request */
+	if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+		return false;
+	if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+		return false;
+
+	end = start + (size >> SECTOR_SHIFT);
+	bound = zram->disksize >> SECTOR_SHIFT;
+	/* out of range range */
+	if (unlikely(start >= bound || end > bound || start > end))
+		return false;
+
+	/* I/O request is valid */
+	return true;
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+	if (*offset + bvec->bv_len >= PAGE_SIZE)
+		(*index)++;
+	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static inline void update_used_max(struct zram *zram,
+					const unsigned long pages)
+{
+	unsigned long old_max, cur_max;
+
+	old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+	do {
+		cur_max = old_max;
+		if (pages > cur_max)
+			old_max = atomic_long_cmpxchg(
+				&zram->stats.max_used_pages, cur_max, pages);
+	} while (old_max != cur_max);
+}
+
+static bool page_zero_filled(void *ptr)
+{
+	unsigned int pos;
+	unsigned long *page;
+
+	page = (unsigned long *)ptr;
+
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return false;
+	}
+
+	return true;
+}
+
+static void handle_zero_page(struct bio_vec *bvec)
+{
+	struct page *page = bvec->bv_page;
+	void *user_mem;
+
+	user_mem = kmap_atomic(page);
+	if (is_partial_io(bvec))
+		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+	else
+		clear_page(user_mem);
+	kunmap_atomic(user_mem);
+
+	flush_dcache_page(page);
+}
+
+#ifdef CONFIG_STATE_NOTIFIER
+static int zram_compact(struct zram *zram)
+{
+	struct zram_meta *meta;
+	u64 new_size, data_size;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		up_read(&zram->init_lock);
+		pr_info("*** zram state notifier: zram is off ***\n");
+		return -EINVAL;
+	}
+
+	meta = zram->meta;
+
+	data_size = atomic64_read(&zram->stats.compr_data_size);
+
+	pr_info("*** zram state notifier: starting compaction ***\n");
+	zs_compact(meta->mem_pool);
+
+	new_size = atomic64_read(&zram->stats.compr_data_size);
+	if (new_size < data_size)
+		pr_info("%s compacted. Saved %llu kb.\n",
+			zram->disk->disk_name,
+			(unsigned long long)(data_size - new_size));
+	pr_info("*** zram state notifier: finished compaction ***\n");
+
+	up_read(&zram->init_lock);
+
+	return 0;
+}
+
+static int zram_compact_cb(int id, void *ptr, void *data)
+{
+	zram_compact(ptr);
+	return 0;
+}
+
+static int state_notifier_callback(struct notifier_block *this,
+				unsigned long event, void *data)
+{
+	switch (event) {
+		case STATE_NOTIFIER_ACTIVE:
+			break;
+		case STATE_NOTIFIER_SUSPEND:
+			idr_for_each(&zram_index_idr, &zram_compact_cb, NULL);
+			break;
+		default:
+			break;
+	}
+
+	return NOTIFY_OK;
+}
+#endif
+
+static ssize_t initstate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u32 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	val = init_done(zram);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t disksize_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+}
+
+static ssize_t orig_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	deprecated_attr_warn("orig_data_size");
+	return scnprintf(buf, PAGE_SIZE, "%llu\n",
+		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_total_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val = 0;
+	struct zram *zram = dev_to_zram(dev);
+
+	deprecated_attr_warn("mem_used_total");
+	down_read(&zram->init_lock);
+	if (init_done(zram)) {
+		struct zram_meta *meta = zram->meta;
+		val = zs_get_total_pages(meta->mem_pool);
+	}
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_limit_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	deprecated_attr_warn("mem_limit");
+	down_read(&zram->init_lock);
+	val = zram->limit_pages;
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_limit_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 limit;
+	char *tmp;
+	struct zram *zram = dev_to_zram(dev);
+
+	limit = memparse(buf, &tmp);
+	if (buf == tmp) /* no chars parsed, invalid input */
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+	up_write(&zram->init_lock);
+
+	return len;
+}
+
+static ssize_t mem_used_max_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val = 0;
+	struct zram *zram = dev_to_zram(dev);
+
+	deprecated_attr_warn("mem_used_max");
+	down_read(&zram->init_lock);
+	if (init_done(zram))
+		val = atomic_long_read(&zram->stats.max_used_pages);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int err;
+	unsigned long val;
+	struct zram *zram = dev_to_zram(dev);
+
+	err = kstrtoul(buf, 10, &val);
+	if (err || val != 0)
+		return -EINVAL;
+
+	down_read(&zram->init_lock);
+	if (init_done(zram)) {
+		struct zram_meta *meta = zram->meta;
+		atomic_long_set(&zram->stats.max_used_pages,
+				zs_get_total_pages(meta->mem_pool));
+	}
+	up_read(&zram->init_lock);
+
+	return len;
+}
+
+/*
+ * We switched to per-cpu streams and this attr is not needed anymore.
+ * However, we will keep it around for some time, because:
+ * a) we may revert per-cpu streams in the future
+ * b) it's visible to user space and we need to follow our 2 years
+ *    retirement rule; but we already have a number of 'soon to be
+ *    altered' attrs, so max_comp_streams need to wait for the next
+ *    layoff cycle.
+ */
+static ssize_t max_comp_streams_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
+}
+
+static ssize_t max_comp_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	return len;
+}
+
+static ssize_t comp_algorithm_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	size_t sz;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	sz = zcomp_available_show(zram->compressor, buf);
+	up_read(&zram->init_lock);
+
+	return sz;
+}
+
+static ssize_t comp_algorithm_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	size_t sz;
+
+	if (!zcomp_available_algorithm(buf))
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		up_write(&zram->init_lock);
+		pr_info("Can't change algorithm for initialized device\n");
+		return -EBUSY;
+	}
+	strlcpy(zram->compressor, buf, sizeof(zram->compressor));
+
+	/* ignore trailing newline */
+	sz = strlen(zram->compressor);
+	if (sz > 0 && zram->compressor[sz - 1] == '\n')
+		zram->compressor[sz - 1] = 0x00;
+
+	up_write(&zram->init_lock);
+	return len;
+}
+
+static ssize_t compact_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	struct zram_meta *meta;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		up_read(&zram->init_lock);
+		return -EINVAL;
+	}
+
+	meta = zram->meta;
+	zs_compact(meta->mem_pool);
+	up_read(&zram->init_lock);
+
+	return len;
+}
+
+static ssize_t io_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	ret = scnprintf(buf, PAGE_SIZE,
+			"%8llu %8llu %8llu %8llu\n",
+			(u64)atomic64_read(&zram->stats.failed_reads),
+			(u64)atomic64_read(&zram->stats.failed_writes),
+			(u64)atomic64_read(&zram->stats.invalid_io),
+			(u64)atomic64_read(&zram->stats.notify_free));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+static ssize_t mm_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	struct zs_pool_stats pool_stats;
+	u64 orig_size, mem_used = 0;
+	long max_used;
+	ssize_t ret;
+
+	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
+	down_read(&zram->init_lock);
+	if (init_done(zram)) {
+		mem_used = zs_get_total_pages(zram->meta->mem_pool);
+		zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+	}
+
+	orig_size = atomic64_read(&zram->stats.pages_stored);
+	max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+	ret = scnprintf(buf, PAGE_SIZE,
+			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+			orig_size << PAGE_SHIFT,
+			(u64)atomic64_read(&zram->stats.compr_data_size),
+			mem_used << PAGE_SHIFT,
+			zram->limit_pages << PAGE_SHIFT,
+			max_used << PAGE_SHIFT,
+			(u64)atomic64_read(&zram->stats.zero_pages),
+			pool_stats.pages_compacted);
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+static ssize_t debug_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int version = 1;
+	struct zram *zram = dev_to_zram(dev);
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	ret = scnprintf(buf, PAGE_SIZE,
+			"version: %d\n%8llu\n",
+			version,
+			(u64)atomic64_read(&zram->stats.writestall));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
+static DEVICE_ATTR_RO(debug_stat);
+ZRAM_ATTR_RO(num_reads);
+ZRAM_ATTR_RO(num_writes);
+ZRAM_ATTR_RO(failed_reads);
+ZRAM_ATTR_RO(failed_writes);
+ZRAM_ATTR_RO(invalid_io);
+ZRAM_ATTR_RO(notify_free);
+ZRAM_ATTR_RO(zero_pages);
+ZRAM_ATTR_RO(compr_data_size);
+
+static inline bool zram_meta_get(struct zram *zram)
+{
+	if (atomic_inc_not_zero(&zram->refcount))
+		return true;
+	return false;
+}
+
+static inline void zram_meta_put(struct zram *zram)
+{
+	atomic_dec(&zram->refcount);
+}
+
+static void zram_meta_free(struct zram_meta *meta, u64 disksize)
+{
+	size_t num_pages = disksize >> PAGE_SHIFT;
+	size_t index;
+
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < num_pages; index++) {
+		unsigned long handle = meta->table[index].handle;
+
+		if (!handle)
+			continue;
+
+		zs_free(meta->mem_pool, handle);
+	}
+
+	zs_destroy_pool(meta->mem_pool);
+	vfree(meta->table);
+	kfree(meta);
+}
+
+static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+{
+	size_t num_pages;
+	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+
+	if (!meta)
+		return NULL;
+
+	num_pages = disksize >> PAGE_SHIFT;
+	meta->table = vzalloc(num_pages * sizeof(*meta->table));
+	if (!meta->table) {
+		pr_err("Error allocating zram address table\n");
+		goto out_error;
+	}
+
+	meta->mem_pool = zs_create_pool(pool_name);
+	if (!meta->mem_pool) {
+		pr_err("Error creating memory pool\n");
+		goto out_error;
+	}
+
+	return meta;
+
+out_error:
+	vfree(meta->table);
+	kfree(meta);
+	return NULL;
+}
+
+/*
+ * To protect concurrent access to the same index entry,
+ * caller should hold this table index entry's bit_spinlock to
+ * indicate this index entry is accessing.
+ */
+static void zram_free_page(struct zram *zram, size_t index)
+{
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle = meta->table[index].handle;
+
+	if (unlikely(!handle)) {
+		/*
+		 * No memory is allocated for zero filled pages.
+		 * Simply clear zero page flag.
+		 */
+		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
+			zram_clear_flag(meta, index, ZRAM_ZERO);
+			atomic64_dec(&zram->stats.zero_pages);
+		}
+		return;
+	}
+
+	zs_free(meta->mem_pool, handle);
+
+	atomic64_sub(zram_get_obj_size(meta, index),
+			&zram->stats.compr_data_size);
+	atomic64_dec(&zram->stats.pages_stored);
+
+	meta->table[index].handle = 0;
+	zram_set_obj_size(meta, index, 0);
+}
+
+static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+{
+	int ret = 0;
+	unsigned char *cmem;
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle;
+	size_t size;
+
+	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+	handle = meta->table[index].handle;
+	size = zram_get_obj_size(meta, index);
+
+	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+		clear_page(mem);
+		return 0;
+	}
+
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+	if (size == PAGE_SIZE)
+		copy_page(mem, cmem);
+	else
+		ret = zcomp_decompress(zram->comp, cmem, size, mem);
+	zs_unmap_object(meta->mem_pool, handle);
+	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+	/* Should NEVER happen. Return bio error if it does. */
+	if (unlikely(ret)) {
+		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+			  u32 index, int offset)
+{
+	int ret;
+	struct page *page;
+	unsigned char *user_mem, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
+	page = bvec->bv_page;
+
+	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+	if (unlikely(!meta->table[index].handle) ||
+			zram_test_flag(meta, index, ZRAM_ZERO)) {
+		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+		handle_zero_page(bvec);
+		return 0;
+	}
+	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+	if (is_partial_io(bvec))
+		/* Use  a temporary buffer to decompress the page */
+		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+
+	user_mem = kmap_atomic(page);
+	if (!is_partial_io(bvec))
+		uncmem = user_mem;
+
+	if (!uncmem) {
+		pr_err("Unable to allocate temp memory\n");
+		ret = -ENOMEM;
+		goto out_cleanup;
+	}
+
+	ret = zram_decompress_page(zram, uncmem, index);
+	/* Should NEVER happen. Return bio error if it does. */
+	if (unlikely(ret))
+		goto out_cleanup;
+
+	if (is_partial_io(bvec))
+		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
+				bvec->bv_len);
+
+	flush_dcache_page(page);
+	ret = 0;
+out_cleanup:
+	kunmap_atomic(user_mem);
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+	return ret;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
+			   int offset)
+{
+	int ret = 0;
+	size_t clen;
+	unsigned long handle = 0;
+	struct page *page;
+	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
+	struct zcomp_strm *zstrm = NULL;
+	unsigned long alloced_pages;
+
+	page = bvec->bv_page;
+	if (is_partial_io(bvec)) {
+		/*
+		 * This is a partial IO. We need to read the full page
+		 * before to write the changes.
+		 */
+		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+		if (!uncmem) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = zram_decompress_page(zram, uncmem, index);
+		if (ret)
+			goto out;
+	}
+
+compress_again:
+	user_mem = kmap_atomic(page);
+	if (is_partial_io(bvec)) {
+		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
+		       bvec->bv_len);
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+	} else {
+		uncmem = user_mem;
+	}
+
+	if (page_zero_filled(uncmem)) {
+		if (user_mem)
+			kunmap_atomic(user_mem);
+		/* Free memory associated with this sector now. */
+		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+		zram_free_page(zram, index);
+		zram_set_flag(meta, index, ZRAM_ZERO);
+		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+		atomic64_inc(&zram->stats.zero_pages);
+		ret = 0;
+		goto out;
+	}
+
+	zstrm = zcomp_strm_find(zram->comp);
+	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
+	if (!is_partial_io(bvec)) {
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+		uncmem = NULL;
+	}
+
+	if (unlikely(ret)) {
+		pr_err("Compression failed! err=%d\n", ret);
+		goto out;
+	}
+
+	src = zstrm->buffer;
+	if (unlikely(clen > max_zpage_size)) {
+		clen = PAGE_SIZE;
+		if (is_partial_io(bvec))
+			src = uncmem;
+	}
+
+	/*
+	 * handle allocation has 2 paths:
+	 * a) fast path is executed with preemption disabled (for
+	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+	 *  since we can't sleep;
+	 * b) slow path enables preemption and attempts to allocate
+	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
+	 *  put per-cpu compression stream and, thus, to re-do
+	 *  the compression once handle is allocated.
+	 *
+	 * if we have a 'non-null' handle here then we are coming
+	 * from the slow path and handle has already been allocated.
+	 */
+	if (!handle)
+		handle = zs_malloc(meta->mem_pool, clen,
+#if 0 /* not implemented yet */
+				__GFP_KSWAPD_RECLAIM |
+#endif
+				__GFP_NOWARN |
+				__GFP_HIGHMEM);
+	if (!handle) {
+		zcomp_strm_release(zram->comp, zstrm);
+		zstrm = NULL;
+
+		atomic64_inc(&zram->stats.writestall);
+
+		handle = zs_malloc(meta->mem_pool, clen,
+				GFP_NOIO | __GFP_HIGHMEM);
+		if (handle)
+			goto compress_again;
+
+		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
+			index, clen);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	alloced_pages = zs_get_total_pages(meta->mem_pool);
+	update_used_max(zram, alloced_pages);
+
+	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+		zs_free(meta->mem_pool, handle);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+
+	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+		src = kmap_atomic(page);
+		copy_page(cmem, src);
+		kunmap_atomic(src);
+	} else {
+		memcpy(cmem, src, clen);
+	}
+
+	zcomp_strm_release(zram->comp, zstrm);
+	zstrm = NULL;
+	zs_unmap_object(meta->mem_pool, handle);
+
+	/*
+	 * Free memory associated with this sector
+	 * before overwriting unused sectors.
+	 */
+	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+	zram_free_page(zram, index);
+
+	meta->table[index].handle = handle;
+	zram_set_obj_size(meta, index, clen);
+	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+	/* Update stats */
+	atomic64_add(clen, &zram->stats.compr_data_size);
+	atomic64_inc(&zram->stats.pages_stored);
+out:
+	if (zstrm)
+		zcomp_strm_release(zram->comp, zstrm);
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+	return ret;
+}
+
+/*
+ * zram_bio_discard - handler on discard request
+ * @index: physical block index in PAGE_SIZE units
+ * @offset: byte offset within physical block
+ */
+static void zram_bio_discard(struct zram *zram, u32 index,
+			     int offset, struct bio *bio)
+{
+	size_t n = bio->bi_size;
+	struct zram_meta *meta = zram->meta;
+
+	/*
+	 * zram manages data in physical block size units. Because logical block
+	 * size isn't identical with physical block size on some arch, we
+	 * could get a discard request pointing to a specific offset within a
+	 * certain physical block.  Although we can handle this request by
+	 * reading that physiclal block and decompressing and partially zeroing
+	 * and re-compressing and then re-storing it, this isn't reasonable
+	 * because our intent with a discard request is to save memory.  So
+	 * skipping this logical block is appropriate here.
+	 */
+	if (offset) {
+		if (n <= (PAGE_SIZE - offset))
+			return;
+
+		n -= (PAGE_SIZE - offset);
+		index++;
+	}
+
+	while (n >= PAGE_SIZE) {
+		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+		zram_free_page(zram, index);
+		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+		atomic64_inc(&zram->stats.notify_free);
+		index++;
+		n -= PAGE_SIZE;
+	}
+}
+
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+			int offset, int rw)
+{
+	unsigned long start_time = jiffies;
+	int ret;
+
+	generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
+			&zram->disk->part0);
+
+	if (rw == READ) {
+		atomic64_inc(&zram->stats.num_reads);
+		ret = zram_bvec_read(zram, bvec, index, offset);
+	} else {
+		atomic64_inc(&zram->stats.num_writes);
+		ret = zram_bvec_write(zram, bvec, index, offset);
+	}
+
+	generic_end_io_acct(rw, &zram->disk->part0, start_time);
+
+	if (unlikely(ret)) {
+		if (rw == READ)
+			atomic64_inc(&zram->stats.failed_reads);
+		else
+			atomic64_inc(&zram->stats.failed_writes);
+	}
+
+	return ret;
+}
+
+static void __zram_make_request(struct zram *zram, struct bio *bio)
+{
+	int offset, rw, i;
+	u32 index;
+	struct bio_vec *bvec;
+
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+	offset = (bio->bi_sector &
+		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+		zram_bio_discard(zram, index, offset, bio);
+		bio_endio(bio, 0);
+		return;
+	}
+
+	rw = bio_data_dir(bio);
+	bio_for_each_segment(bvec, bio, i) {
+		int max_transfer_size = PAGE_SIZE - offset;
+
+		if (bvec->bv_len > max_transfer_size) {
+			/*
+			 * zram_bvec_rw() can only make operation on a single
+			 * zram page. Split the bio vector.
+			 */
+			struct bio_vec bv;
+
+			bv.bv_page = bvec->bv_page;
+			bv.bv_len = max_transfer_size;
+			bv.bv_offset = bvec->bv_offset;
+
+			if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
+				goto out;
+
+			bv.bv_len = bvec->bv_len - max_transfer_size;
+			bv.bv_offset += max_transfer_size;
+			if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
+				goto out;
+		} else
+			if (zram_bvec_rw(zram, bvec, index, offset, rw) < 0)
+				goto out;
+
+		update_position(&index, &offset, bvec);
+	}
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return;
+
+out:
+	bio_io_error(bio);
+}
+
+/*
+ * Handler function for all zram I/O requests.
+ */
+static void zram_make_request(struct request_queue *queue, struct bio *bio)
+{
+	struct zram *zram = queue->queuedata;
+
+	if (unlikely(!zram_meta_get(zram)))
+		goto error;
+
+	if (!valid_io_request(zram, bio->bi_sector,
+					bio->bi_size)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		goto put_zram;
+	}
+
+	__zram_make_request(zram, bio);
+	zram_meta_put(zram);
+	return;
+put_zram:
+	zram_meta_put(zram);
+error:
+	bio_io_error(bio);
+}
+
+static void zram_slot_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct zram *zram;
+	struct zram_meta *meta;
+
+	zram = bdev->bd_disk->private_data;
+	meta = zram->meta;
+
+	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+	zram_free_page(zram, index);
+	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+	atomic64_inc(&zram->stats.notify_free);
+}
+
+/* not ready for this change right now: Dorimanx */
+#if 0
+static int zram_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, int rw)
+{
+	int offset, err = -EIO;
+	u32 index;
+	struct zram *zram;
+	struct bio_vec bv;
+
+	zram = bdev->bd_disk->private_data;
+	if (unlikely(!zram_meta_get(zram)))
+		goto out;
+
+	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		err = -EINVAL;
+		goto put_zram;
+	}
+
+	index = sector >> SECTORS_PER_PAGE_SHIFT;
+	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+
+	bv.bv_page = page;
+	bv.bv_len = PAGE_SIZE;
+	bv.bv_offset = 0;
+
+	err = zram_bvec_rw(zram, &bv, index, offset, rw);
+put_zram:
+	zram_meta_put(zram);
+out:
+	/*
+	 * If I/O fails, just return error(ie, non-zero) without
+	 * calling page_endio.
+	 * It causes resubmit the I/O with bio request by upper functions
+	 * of rw_page(e.g., swap_readpage, __swap_writepage) and
+	 * bio->bi_end_io does things to handle the error
+	 * (e.g., SetPageError, set_page_dirty and extra works).
+	 */
+	if (err == 0)
+		page_endio(page, rw, 0);
+	return err;
+}
+#endif
+
+static void zram_reset_device(struct zram *zram)
+{
+	struct zram_meta *meta;
+	struct zcomp *comp;
+	u64 disksize;
+
+	down_write(&zram->init_lock);
+
+	zram->limit_pages = 0;
+
+	if (!init_done(zram)) {
+		up_write(&zram->init_lock);
+		return;
+	}
+
+	meta = zram->meta;
+	comp = zram->comp;
+	disksize = zram->disksize;
+	/*
+	 * Refcount will go down to 0 eventually and r/w handler
+	 * cannot handle further I/O so it will bail out by
+	 * check zram_meta_get.
+	 */
+	zram_meta_put(zram);
+	/*
+	 * We want to free zram_meta in process context to avoid
+	 * deadlock between reclaim path and any other locks.
+	 */
+	wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
+
+	/* Reset stats */
+	memset(&zram->stats, 0, sizeof(zram->stats));
+	zram->disksize = 0;
+
+	set_capacity(zram->disk, 0);
+	part_stat_set_all(&zram->disk->part0, 0);
+
+	up_write(&zram->init_lock);
+	/* I/O operation under all of CPU are done so let's free */
+	zram_meta_free(meta, disksize);
+	zcomp_destroy(comp);
+}
+
+static ssize_t disksize_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 disksize;
+	struct zcomp *comp;
+	struct zram_meta *meta;
+	struct zram *zram = dev_to_zram(dev);
+	int err;
+
+	disksize = memparse(buf, NULL);
+	if (!disksize)
+		return -EINVAL;
+
+	disksize = PAGE_ALIGN(disksize);
+	meta = zram_meta_alloc(zram->disk->disk_name, disksize);
+	if (!meta)
+		return -ENOMEM;
+
+	comp = zcomp_create(zram->compressor);
+	if (IS_ERR(comp)) {
+		pr_err("Cannot initialise %s compressing backend\n",
+				zram->compressor);
+		err = PTR_ERR(comp);
+		goto out_free_meta;
+	}
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		err = -EBUSY;
+		goto out_destroy_comp;
+	}
+
+	init_waitqueue_head(&zram->io_done);
+	atomic_set(&zram->refcount, 1);
+	zram->meta = meta;
+	zram->comp = comp;
+	zram->disksize = disksize;
+	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	up_write(&zram->init_lock);
+
+	/*
+	 * Revalidate disk out of the init_lock to avoid lockdep splat.
+	 * It's okay because disk's capacity is protected by init_lock
+	 * so that revalidate_disk always sees up-to-date capacity.
+	 */
+	revalidate_disk(zram->disk);
+
+	return len;
+
+out_destroy_comp:
+	up_write(&zram->init_lock);
+	zcomp_destroy(comp);
+out_free_meta:
+	zram_meta_free(meta, disksize);
+	return err;
+}
+
+static ssize_t reset_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int ret;
+	unsigned short do_reset;
+	struct zram *zram;
+	struct block_device *bdev;
+
+	ret = kstrtou16(buf, 10, &do_reset);
+	if (ret)
+		return ret;
+
+	if (!do_reset)
+		return -EINVAL;
+
+	zram = dev_to_zram(dev);
+	bdev = bdget_disk(zram->disk, 0);
+	if (!bdev)
+		return -ENOMEM;
+
+	mutex_lock(&bdev->bd_mutex);
+	/* Do not reset an active device or claimed device */
+	if (bdev->bd_openers || zram->claim) {
+		mutex_unlock(&bdev->bd_mutex);
+		bdput(bdev);
+		return -EBUSY;
+	}
+
+	/* From now on, anyone can't open /dev/zram[0-9] */
+	zram->claim = true;
+	mutex_unlock(&bdev->bd_mutex);
+
+	/* Make sure all the pending I/O are finished */
+	fsync_bdev(bdev);
+	zram_reset_device(zram);
+	revalidate_disk(zram->disk);
+	bdput(bdev);
+
+	mutex_lock(&bdev->bd_mutex);
+	zram->claim = false;
+	mutex_unlock(&bdev->bd_mutex);
+
+	return len;
+}
+
+static int zram_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret = 0;
+	struct zram *zram;
+
+	WARN_ON(!mutex_is_locked(&bdev->bd_mutex));
+
+	zram = bdev->bd_disk->private_data;
+	/* zram was claimed to reset so open request fails */
+	if (zram->claim)
+		ret = -EBUSY;
+
+	return ret;
+}
+
+static const struct block_device_operations zram_devops = {
+	.open = zram_open,
+	.swap_slot_free_notify = zram_slot_free_notify,
+#if 0
+	.rw_page = zram_rw_page,
+#endif
+	.owner = THIS_MODULE
+};
+
+static DEVICE_ATTR_WO(compact);
+static DEVICE_ATTR_RW(disksize);
+static DEVICE_ATTR_RO(initstate);
+static DEVICE_ATTR_WO(reset);
+static DEVICE_ATTR_RO(orig_data_size);
+static DEVICE_ATTR_RO(mem_used_total);
+static DEVICE_ATTR_RW(mem_limit);
+static DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
+
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_num_reads.attr,
+	&dev_attr_num_writes.attr,
+	&dev_attr_failed_reads.attr,
+	&dev_attr_failed_writes.attr,
+	&dev_attr_compact.attr,
+	&dev_attr_invalid_io.attr,
+	&dev_attr_notify_free.attr,
+	&dev_attr_zero_pages.attr,
+	&dev_attr_orig_data_size.attr,
+	&dev_attr_compr_data_size.attr,
+	&dev_attr_mem_used_total.attr,
+	&dev_attr_mem_limit.attr,
+	&dev_attr_mem_used_max.attr,
+	&dev_attr_max_comp_streams.attr,
+	&dev_attr_comp_algorithm.attr,
+	&dev_attr_io_stat.attr,
+	&dev_attr_mm_stat.attr,
+	&dev_attr_debug_stat.attr,
+	NULL,
+};
+
+static struct attribute_group zram_disk_attr_group = {
+	.attrs = zram_disk_attrs,
+};
+
+/*
+ * Allocate and initialize new zram device. the function returns
+ * '>= 0' device_id upon success, and negative value otherwise.
+ */
+static int zram_add(void)
+{
+	struct zram *zram;
+	struct request_queue *queue;
+	int ret, device_id;
+
+	zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
+	if (!zram)
+		return -ENOMEM;
+
+	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out_free_dev;
+	device_id = ret;
+
+	init_rwsem(&zram->init_lock);
+
+	queue = blk_alloc_queue(GFP_KERNEL);
+	if (!queue) {
+		pr_err("Error allocating disk queue for device %d\n",
+			device_id);
+		ret = -ENOMEM;
+		goto out_free_idr;
+	}
+
+	blk_queue_make_request(queue, zram_make_request);
+
+	/* gendisk structure */
+	zram->disk = alloc_disk(1);
+	if (!zram->disk) {
+		pr_err("Error allocating disk structure for device %d\n",
+			device_id);
+		ret = -ENOMEM;
+		goto out_free_queue;
+	}
+
+	zram->disk->major = zram_major;
+	zram->disk->first_minor = device_id;
+	zram->disk->fops = &zram_devops;
+	zram->disk->queue = queue;
+	zram->disk->queue->queuedata = zram;
+	zram->disk->private_data = zram;
+	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+	__set_bit(QUEUE_FLAG_FAST, &zram->disk->queue->queue_flags);
+	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
+	set_capacity(zram->disk, 0);
+	/* zram devices sort of resembles non-rotational disks */
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+	/*
+	 * To ensure that we always get PAGE_SIZE aligned
+	 * and n*PAGE_SIZED sized I/O requests.
+	 */
+	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(zram->disk->queue,
+					ZRAM_LOGICAL_BLOCK_SIZE);
+	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
+	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+	/*
+	 * zram_bio_discard() will clear all logical blocks if logical block
+	 * size is identical with physical block size(PAGE_SIZE). But if it is
+	 * different, we will skip discarding some parts of logical blocks in
+	 * the part of the request range which isn't aligned to physical block
+	 * size.  So we can't ensure that all discarded logical blocks are
+	 * zeroed.
+	 */
+	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
+		zram->disk->queue->limits.discard_zeroes_data = 1;
+	else
+		zram->disk->queue->limits.discard_zeroes_data = 0;
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
+	add_disk(zram->disk);
+
+	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
+				&zram_disk_attr_group);
+	if (ret < 0) {
+		pr_err("Error creating sysfs group for device %d\n",
+				device_id);
+		goto out_free_disk;
+	}
+	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+	zram->meta = NULL;
+
+	pr_info("Added device: %s\n", zram->disk->disk_name);
+	return device_id;
+
+out_free_disk:
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+out_free_queue:
+	blk_cleanup_queue(queue);
+out_free_idr:
+	idr_remove(&zram_index_idr, device_id);
+out_free_dev:
+	kfree(zram);
+	return ret;
+}
+
+static int zram_remove(struct zram *zram)
+{
+	struct block_device *bdev;
+
+	bdev = bdget_disk(zram->disk, 0);
+	if (!bdev)
+		return -ENOMEM;
+
+	mutex_lock(&bdev->bd_mutex);
+	if (bdev->bd_openers || zram->claim) {
+		mutex_unlock(&bdev->bd_mutex);
+		bdput(bdev);
+		return -EBUSY;
+	}
+
+	zram->claim = true;
+	mutex_unlock(&bdev->bd_mutex);
+
+	/*
+	 * Remove sysfs first, so no one will perform a disksize
+	 * store while we destroy the devices. This also helps during
+	 * hot_remove -- zram_reset_device() is the last holder of
+	 * ->init_lock, no later/concurrent disksize_store() or any
+	 * other sysfs handlers are possible.
+	 */
+	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+			&zram_disk_attr_group);
+
+	/* Make sure all the pending I/O are finished */
+	fsync_bdev(bdev);
+	zram_reset_device(zram);
+	bdput(bdev);
+
+	pr_info("Removed device: %s\n", zram->disk->disk_name);
+
+	blk_cleanup_queue(zram->disk->queue);
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+	kfree(zram);
+	return 0;
+}
+
+/* zram-control sysfs attributes */
+static ssize_t hot_add_show(struct class *class,
+			struct class_attribute *attr,
+			char *buf)
+{
+	int ret;
+
+	mutex_lock(&zram_index_mutex);
+	ret = zram_add();
+	mutex_unlock(&zram_index_mutex);
+
+	if (ret < 0)
+		return ret;
+	return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+}
+
+static ssize_t hot_remove_store(struct class *class,
+			struct class_attribute *attr,
+			const char *buf,
+			size_t count)
+{
+	struct zram *zram;
+	int ret, dev_id;
+
+	/* dev_id is gendisk->first_minor, which is `int' */
+	ret = kstrtoint(buf, 10, &dev_id);
+	if (ret)
+		return ret;
+	if (dev_id < 0)
+		return -EINVAL;
+
+	mutex_lock(&zram_index_mutex);
+
+	zram = idr_find(&zram_index_idr, dev_id);
+	if (zram) {
+		ret = zram_remove(zram);
+		idr_remove(&zram_index_idr, dev_id);
+	} else {
+		ret = -ENODEV;
+	}
+
+	mutex_unlock(&zram_index_mutex);
+	return ret ? ret : count;
+}
+
+static struct class_attribute zram_control_class_attrs[] = {
+	__ATTR_RO(hot_add),
+	__ATTR_WO(hot_remove),
+	__ATTR_NULL,
+};
+
+static struct class zram_control_class = {
+	.name		= "zram-control",
+	.owner		= THIS_MODULE,
+	.class_attrs	= zram_control_class_attrs,
+};
+
+static int zram_remove_cb(int id, void *ptr, void *data)
+{
+	zram_remove(ptr);
+	return 0;
+}
+
+static void destroy_devices(void)
+{
+	class_unregister(&zram_control_class);
+	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
+	idr_destroy(&zram_index_idr);
+	unregister_blkdev(zram_major, "zram");
+}
+
+static int __init zram_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_STATE_NOTIFIER
+	notif.notifier_call = state_notifier_callback;
+	if (state_register_client(&notif))
+		pr_warn("Failed to register State notifier callback\n");
+#endif
+
+	ret = class_register(&zram_control_class);
+	if (ret) {
+		pr_err("Unable to register zram-control class\n");
+		return ret;
+	}
+
+	zram_major = register_blkdev(0, "zram");
+	if (zram_major <= 0) {
+		pr_err("Unable to get major number\n");
+		class_unregister(&zram_control_class);
+		return -EBUSY;
+	}
+
+	while (num_devices != 0) {
+		mutex_lock(&zram_index_mutex);
+		ret = zram_add();
+		mutex_unlock(&zram_index_mutex);
+		if (ret < 0)
+			goto out_error;
+		num_devices--;
+	}
+
+	return 0;
+
+out_error:
+	destroy_devices();
+	return ret;
+}
+
+static void __exit zram_exit(void)
+{
+#ifdef CONFIG_STATE_NOTIFIER
+	state_unregister_client(&notif);
+	notif.notifier_call = NULL;
+#endif
+	destroy_devices();
+}
+
+module_init(zram_init);
+module_exit(zram_exit);
+
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Block Device");
diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
similarity index 53%
rename from drivers/staging/zram/zram_drv.h
rename to drivers/block/zram/zram_drv.h
index 508a19f44..3f5bf66a2 100644
--- a/drivers/staging/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -2,6 +2,7 @@
  * Compressed RAM block device
  *
  * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the licence that better fits your requirements.
@@ -9,22 +10,15 @@
  * Released under the terms of 3-clause BSD License
  * Released under the terms of GNU General Public License Version 2.0
  *
- * Project home: http://compcache.googlecode.com
  */
 
 #ifndef _ZRAM_DRV_H_
 #define _ZRAM_DRV_H_
 
 #include <linux/spinlock.h>
-#include <linux/mutex.h>
+#include <linux/zsmalloc.h>
 
-#include "../zsmalloc/zsmalloc.h"
-
-/*
- * Some arbitrary value. This is just to catch
- * invalid value for num_devices module parameter.
- */
-static const unsigned max_num_devices = 32;
+#include "zcomp.h"
 
 /*-- Configurable parameters */
 
@@ -32,7 +26,7 @@ static const unsigned max_num_devices = 32;
  * Pages that compress to size greater than this are stored
  * uncompressed in memory.
  */
-static const size_t max_zpage_size = PAGE_SIZE / 10 * 9;
+static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
 
 /*
  * NOTE: max_zpage_size must be less than or equal to:
@@ -43,7 +37,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 10 * 9;
 /*-- End of configurable params */
 
 #define SECTOR_SHIFT		9
-#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
 #define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
 #define ZRAM_LOGICAL_BLOCK_SHIFT 12
@@ -51,10 +44,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 10 * 9;
 #define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
 	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
 
-/* Flags for zram pages (table[page_no].flags) */
+
+/*
+ * The lower ZRAM_FLAG_SHIFT bits of table.value is for
+ * object size (excluding header), the higher bits is for
+ * zram_pageflags.
+ *
+ * zram is mainly used for memory efficiency so we want to keep memory
+ * footprint small so we can squeeze size and flags into a field.
+ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header),
+ * the higher bits is for zram_pageflags.
+ */
+#define ZRAM_FLAG_SHIFT 24
+
+/* Flags for zram pages (table[page_no].value) */
 enum zram_pageflags {
 	/* Page consists entirely of zeros */
-	ZRAM_ZERO,
+	ZRAM_ZERO = ZRAM_FLAG_SHIFT,
+	ZRAM_ACCESS,	/* page is now accessed */
 
 	__NR_ZRAM_PAGEFLAGS,
 };
@@ -62,64 +69,54 @@ enum zram_pageflags {
 /*-- Data structures */
 
 /* Allocated for each disk page */
-struct table {
+struct zram_table_entry {
 	unsigned long handle;
-	u16 size;	/* object size (excluding header) */
-	u8 count;	/* object ref count (not yet used) */
-	u8 flags;
-} __aligned(4);
+	unsigned long value;
+};
 
-/*
- * All 64bit fields should only be manipulated by 64bit atomic accessors.
- * All modifications to 32bit counter should be protected by zram->lock.
- */
 struct zram_stats {
-	atomic64_t compr_size;	/* compressed size of pages stored */
+	atomic64_t compr_data_size;	/* compressed size of pages stored */
 	atomic64_t num_reads;	/* failed + successful */
 	atomic64_t num_writes;	/* --do-- */
-	atomic64_t failed_reads;	/* should NEVER! happen */
+	atomic64_t failed_reads;	/* can happen when memory is too low */
 	atomic64_t failed_writes;	/* can happen when memory is too low */
 	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
 	atomic64_t notify_free;	/* no. of swap slot free notifications */
-	u32 pages_zero;		/* no. of zero filled pages */
-	u32 pages_stored;	/* no. of pages currently stored */
-	u32 good_compress;	/* % of pages with compression ratio<=50% */
-	u32 bad_compress;	/* % of pages with compression ratio>=75% */
+	atomic64_t zero_pages;		/* no. of zero filled pages */
+	atomic64_t pages_stored;	/* no. of pages currently stored */
+	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
+	atomic64_t writestall;		/* no. of write slow paths */
 };
 
 struct zram_meta {
-	void *compress_workmem;
-	void *compress_buffer;
-	struct table *table;
+	struct zram_table_entry *table;
 	struct zs_pool *mem_pool;
 };
 
-struct zram_slot_free {
-	unsigned long index;
-	struct zram_slot_free *next;
-};
-
 struct zram {
 	struct zram_meta *meta;
-	struct rw_semaphore lock; /* protect compression buffers, table,
-				   * 32bit stat counters against concurrent
-				   * notifications, reads and writes */
-
-	struct work_struct free_work;  /* handle pending free request */
-	struct zram_slot_free *slot_free_rq; /* list head of free request */
-
-	struct request_queue *queue;
+	struct zcomp *comp;
 	struct gendisk *disk;
-	int init_done;
-	/* Prevent concurrent execution of device init, reset and R/W request */
+	/* Prevent concurrent execution of device init */
 	struct rw_semaphore init_lock;
+	/*
+	 * the number of pages zram can consume for storing compressed data
+	 */
+	unsigned long limit_pages;
+
+	struct zram_stats stats;
+	atomic_t refcount; /* refcount for zram_meta */
+	/* wait all IO under all of cpu are done */
+	wait_queue_head_t io_done;
 	/*
 	 * This is the limit on amount of *uncompressed* worth of data
 	 * we can store in a disk.
 	 */
 	u64 disksize;	/* bytes */
-	spinlock_t slot_free_lock;
-
-	struct zram_stats stats;
+	char compressor[10];
+	/*
+	 * zram is claimed so open request will be failed
+	 */
+	bool claim; /* Protected by bdev->bd_mutex */
 };
 #endif
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index ed4b7481a..93c5b2fdf 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -2945,7 +2945,7 @@ static struct drm_property_blob *drm_property_create_blob(struct drm_device *dev
 	struct drm_property_blob *blob;
 	int ret;
 
-	if (!length || !data)
+	if (!length || length > ULONG_MAX - sizeof(struct drm_property_blob) || !data)
 		return NULL;
 
 	blob = kzalloc(sizeof(struct drm_property_blob)+length, GFP_KERNEL);
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 2f46bbfbb..b242534f5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -172,11 +172,12 @@ nouveau_gem_info(struct drm_file *file_priv, struct drm_gem_object *gem,
 	struct nouveau_bo *nvbo = nouveau_gem_object(gem);
 	struct nouveau_vma *vma;
 
-	if (nvbo->bo.mem.mem_type == TTM_PL_TT)
+	if (is_power_of_2(nvbo->valid_domains))
+		rep->domain = nvbo->valid_domains;
+	else if (nvbo->bo.mem.mem_type == TTM_PL_TT)
 		rep->domain = NOUVEAU_GEM_DOMAIN_GART;
 	else
 		rep->domain = NOUVEAU_GEM_DOMAIN_VRAM;
-
 	rep->offset = nvbo->bo.offset;
 	if (fpriv->vm) {
 		vma = nouveau_bo_vma_find(nvbo, fpriv->vm);
diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c
index b72eb507d..69c2dd085 100644
--- a/drivers/gpu/drm/radeon/radeon_combios.c
+++ b/drivers/gpu/drm/radeon/radeon_combios.c
@@ -3399,6 +3399,14 @@ void radeon_combios_asic_init(struct drm_device *dev)
 	    rdev->pdev->subsystem_device == 0x30ae)
 		return;
 
+	/* quirk for rs4xx HP Compaq dc5750 Small Form Factor to make it resume
+	 * - it hangs on resume inside the dynclk 1 table.
+	 */
+	if (rdev->family == CHIP_RS480 &&
+	    rdev->pdev->subsystem_vendor == 0x103c &&
+	    rdev->pdev->subsystem_device == 0x280a)
+		return;
+
 	/* DYN CLK 1 */
 	table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE);
 	if (table)
diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c
index 9184bbe7c..9c5d96cb6 100644
--- a/drivers/gpu/drm/radeon/radeon_connectors.c
+++ b/drivers/gpu/drm/radeon/radeon_connectors.c
@@ -82,6 +82,11 @@ void radeon_connector_hotplug(struct drm_connector *connector)
 			if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) {
 				drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
 			} else if (radeon_dp_needs_link_train(radeon_connector)) {
+				/* Don't try to start link training before we
+				 * have the dpcd */
+				if (!radeon_dp_getdpcd(radeon_connector))
+					return;
+
 				/* set it to OFF so that drm_helper_connector_dpms()
 				 * won't return immediately since the current state
 				 * is ON at this point.
diff --git a/drivers/gpu/ion/ion.c b/drivers/gpu/ion/ion.c
index 9c61bad7e..34d20c949 100644
--- a/drivers/gpu/ion/ion.c
+++ b/drivers/gpu/ion/ion.c
@@ -325,7 +325,7 @@ static struct ion_handle *ion_handle_create(struct ion_client *client,
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
 	kref_init(&handle->ref);
-	rb_init_node(&handle->node);
+	RB_CLEAR_NODE(&handle->node);
 	handle->client = client;
 	ion_buffer_get(buffer);
 	ion_buffer_add_to_handle(buffer);
diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c
index 5cb6f4f64..791989de8 100644
--- a/drivers/gpu/msm/adreno.c
+++ b/drivers/gpu/msm/adreno.c
@@ -1735,7 +1735,7 @@ static int adreno_of_get_pdata(struct platform_device *pdev)
 
 	if (adreno_of_read_property(pdev->dev.of_node, "qcom,idle-timeout",
 		&pdata->idle_timeout))
-		pdata->idle_timeout = HZ/12;
+		pdata->idle_timeout = 80;
 
 	pdata->strtstp_sleepwake = of_property_read_bool(pdev->dev.of_node,
 						"qcom,strtstp-sleepwake");
diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c
index 12c40c1ae..2a42e64db 100644
--- a/drivers/gpu/msm/adreno_dispatch.c
+++ b/drivers/gpu/msm/adreno_dispatch.c
@@ -267,7 +267,7 @@ static struct kgsl_cmdbatch *_get_cmdbatch(struct adreno_context *drawctxt)
 		 * it hasn't already been started
 		 */
 		if (!timer_pending(&cmdbatch->timer))
-			mod_timer(&cmdbatch->timer, jiffies + (5 * HZ));
+			mod_timer(&cmdbatch->timer, jiffies + msecs_to_jiffies(5000));
 
 		return ERR_PTR(-EAGAIN);
 	}
diff --git a/drivers/gpu/msm/adreno_profile.c b/drivers/gpu/msm/adreno_profile.c
index ebe9e587f..3929eee47 100644
--- a/drivers/gpu/msm/adreno_profile.c
+++ b/drivers/gpu/msm/adreno_profile.c
@@ -909,7 +909,7 @@ static int profile_pipe_print(struct file *filep, char __user *ubuf,
 
 		kgsl_mutex_unlock(&device->mutex, &device->mutex_owner);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ / 10);
+		schedule_timeout(msecs_to_jiffies(100));
 		kgsl_mutex_lock(&device->mutex, &device->mutex_owner);
 
 		if (signal_pending(current)) {
diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h
index c69bbee74..234666488 100644
--- a/drivers/gpu/msm/kgsl_device.h
+++ b/drivers/gpu/msm/kgsl_device.h
@@ -30,7 +30,6 @@
 #define KGSL_TIMEOUT_DEFAULT        0xFFFFFFFF
 #define KGSL_TIMEOUT_PART           50 /* 50 msec */
 
-#define FIRST_TIMEOUT (HZ / 2)
 
 
 /* KGSL device state is initialized to INIT when platform_probe		*
diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c
index 98140720d..8a2b31085 100644
--- a/drivers/gpu/msm/kgsl_pwrctrl.c
+++ b/drivers/gpu/msm/kgsl_pwrctrl.c
@@ -505,7 +505,6 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev,
 	unsigned int val = 0;
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
 	struct kgsl_pwrctrl *pwr;
-	const long div = 1000/HZ;
 	int ret;
 
 	if (device == NULL)
@@ -519,8 +518,7 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev,
 	kgsl_mutex_lock(&device->mutex, &device->mutex_owner);
 
 	/* Let the timeout be requested in ms, but convert to jiffies. */
-	val /= div;
-	pwr->interval_timeout = val;
+	pwr->interval_timeout = msecs_to_jiffies(val);
 
 	kgsl_mutex_unlock(&device->mutex, &device->mutex_owner);
 
@@ -532,12 +530,11 @@ static int kgsl_pwrctrl_idle_timer_show(struct device *dev,
 					char *buf)
 {
 	struct kgsl_device *device = kgsl_device_from_dev(dev);
-	int mul = 1000/HZ;
 	if (device == NULL)
 		return 0;
 	/* Show the idle_timeout converted to msec */
 	return snprintf(buf, PAGE_SIZE, "%d\n",
-		device->pwrctrl.interval_timeout * mul);
+		jiffies_to_msecs(device->pwrctrl.interval_timeout));
 }
 
 static int kgsl_pwrctrl_pmqos_latency_store(struct device *dev,
@@ -1121,7 +1118,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device)
 	pwr->power_flags = 0;
 
 	pwr->idle_needed = pdata->idle_needed;
-	pwr->interval_timeout = pdata->idle_timeout;
+	pwr->interval_timeout = msecs_to_jiffies(pdata->idle_timeout);
 	pwr->strtstp_sleepwake = pdata->strtstp_sleepwake;
 	pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk");
 	if (IS_ERR(pwr->ebi1_clk))
@@ -1722,7 +1719,7 @@ static int _check_active_count(struct kgsl_device *device, int count)
 int kgsl_active_count_wait(struct kgsl_device *device, int count)
 {
 	int result = 0;
-	long wait_jiffies = HZ;
+	long wait_jiffies = msecs_to_jiffies(1000);
 
 	BUG_ON(!mutex_is_locked(&device->mutex));
 
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index c889aaef3..90104c6fc 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -856,6 +856,11 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
 	case IB_CM_SIDR_REQ_RCVD:
 		spin_unlock_irq(&cm_id_priv->lock);
 		cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+		spin_lock_irq(&cm.lock);
+		if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+			rb_erase(&cm_id_priv->sidr_id_node,
+				 &cm.remote_sidr_table);
+		spin_unlock_irq(&cm.lock);
 		break;
 	case IB_CM_REQ_SENT:
 		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
@@ -3092,7 +3097,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
 	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
 	spin_lock_irqsave(&cm.lock, flags);
-	rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+		rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+		RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+	}
 	spin_unlock_irqrestore(&cm.lock, flags);
 	return 0;
 
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 5bcb2afd3..228af1894 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -69,7 +69,7 @@
  */
 
 struct ib_uverbs_device {
-	struct kref				ref;
+	atomic_t				refcount;
 	int					num_comp_vectors;
 	struct completion			comp;
 	struct device			       *dev;
@@ -78,6 +78,7 @@ struct ib_uverbs_device {
 	struct cdev			        cdev;
 	struct rb_root				xrcd_tree;
 	struct mutex				xrcd_tree_mutex;
+	struct kobject				kobj;
 };
 
 struct ib_uverbs_event_file {
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 4d27e4c3f..95885b490 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1979,6 +1979,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 		next->send_flags = user_wr->send_flags;
 
 		if (is_ud) {
+			if (next->opcode != IB_WR_SEND &&
+			    next->opcode != IB_WR_SEND_WITH_IMM) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+
 			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
 						     file->ucontext);
 			if (!next->wr.ud.ah) {
@@ -2015,9 +2021,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
 					user_wr->wr.atomic.compare_add;
 				next->wr.atomic.swap = user_wr->wr.atomic.swap;
 				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+			case IB_WR_SEND:
 				break;
 			default:
-				break;
+				ret = -EINVAL;
+				goto out_put;
 			}
 		}
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 5b51e4e6e..c8e766924 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -117,14 +117,18 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device);
 
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
 {
 	struct ib_uverbs_device *dev =
-		container_of(ref, struct ib_uverbs_device, ref);
+		container_of(kobj, struct ib_uverbs_device, kobj);
 
-	complete(&dev->comp);
+	kfree(dev);
 }
 
+static struct kobj_type ib_uverbs_dev_ktype = {
+	.release = ib_uverbs_release_dev,
+};
+
 static void ib_uverbs_release_event_file(struct kref *ref)
 {
 	struct ib_uverbs_event_file *file =
@@ -273,13 +277,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 	return context->device->dealloc_ucontext(context);
 }
 
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+	complete(&dev->comp);
+}
+
 static void ib_uverbs_release_file(struct kref *ref)
 {
 	struct ib_uverbs_file *file =
 		container_of(ref, struct ib_uverbs_file, ref);
 
 	module_put(file->device->ib_dev->owner);
-	kref_put(&file->device->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&file->device->refcount))
+		ib_uverbs_comp_dev(file->device);
 
 	kfree(file);
 }
@@ -621,9 +631,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	int ret;
 
 	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
-	if (dev)
-		kref_get(&dev->ref);
-	else
+	if (!atomic_inc_not_zero(&dev->refcount))
 		return -ENXIO;
 
 	if (!try_module_get(dev->ib_dev->owner)) {
@@ -644,6 +652,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	mutex_init(&file->mutex);
 
 	filp->private_data = file;
+	kobject_get(&dev->kobj);
 
 	return nonseekable_open(inode, filp);
 
@@ -651,13 +660,16 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
 	module_put(dev->ib_dev->owner);
 
 err:
-	kref_put(&dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&dev->refcount))
+		ib_uverbs_comp_dev(dev);
+
 	return ret;
 }
 
 static int ib_uverbs_close(struct inode *inode, struct file *filp)
 {
 	struct ib_uverbs_file *file = filp->private_data;
+	struct ib_uverbs_device *dev = file->device;
 
 	ib_uverbs_cleanup_ucontext(file, file->ucontext);
 
@@ -665,6 +677,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp)
 		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
 
 	kref_put(&file->ref, ib_uverbs_release_file);
+	kobject_put(&dev->kobj);
 
 	return 0;
 }
@@ -760,10 +773,11 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	if (!uverbs_dev)
 		return;
 
-	kref_init(&uverbs_dev->ref);
+	atomic_set(&uverbs_dev->refcount, 1);
 	init_completion(&uverbs_dev->comp);
 	uverbs_dev->xrcd_tree = RB_ROOT;
 	mutex_init(&uverbs_dev->xrcd_tree_mutex);
+	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
 
 	spin_lock(&map_lock);
 	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -790,6 +804,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	cdev_init(&uverbs_dev->cdev, NULL);
 	uverbs_dev->cdev.owner = THIS_MODULE;
 	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
 	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
 	if (cdev_add(&uverbs_dev->cdev, base, 1))
 		goto err_cdev;
@@ -820,9 +835,10 @@ static void ib_uverbs_add_one(struct ib_device *device)
 		clear_bit(devnum, overflow_map);
 
 err:
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 	return;
 }
 
@@ -842,9 +858,10 @@ static void ib_uverbs_remove_one(struct ib_device *device)
 	else
 		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
-	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+	if (atomic_dec_and_test(&uverbs_dev->refcount))
+		ib_uverbs_comp_dev(uverbs_dev);
 	wait_for_completion(&uverbs_dev->comp);
-	kfree(uverbs_dev);
+	kobject_put(&uverbs_dev->kobj);
 }
 
 static char *uverbs_devnode(struct device *dev, umode_t *mode)
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index a251becda..890c23b3d 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -169,9 +169,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
 	enum rdma_link_layer ll;
 
 	memset(ah_attr, 0, sizeof *ah_attr);
-	ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
 	ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
 	ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+	if (ll == IB_LINK_LAYER_ETHERNET)
+		ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29;
+	else
+		ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+
 	ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
 	if (ah->av.ib.stat_rate)
 		ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig
index f0f234dda..bc44d4c6c 100644
--- a/drivers/input/touchscreen/Kconfig
+++ b/drivers/input/touchscreen/Kconfig
@@ -1145,6 +1145,12 @@ config TOUCHSCREEN_ZINITIX_BT531
 	  To compile this driver as a module, choose M here: the
 	  module will be called BT531.
 
+config STATE_NOTIFIER
+    bool "State Notifier"
+ 	depends on CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI
+ 	default y
+ 
+
 source "drivers/input/touchscreen/wacom/Kconfig"
 source "drivers/input/touchscreen/gt9xx/Kconfig"
 source "drivers/input/touchscreen/synaptics/Kconfig"
diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile
index be6587f82..33101e7e9 100644
--- a/drivers/input/touchscreen/Makefile
+++ b/drivers/input/touchscreen/Makefile
@@ -100,3 +100,4 @@ obj-$(CONFIG_TOUCHSCREEN_FTS_S) += stm_s/
 obj-$(CONFIG_TOUCHSCREEN_FTS) += stm_fts/
 obj-$(CONFIG_TOUCHSCREEN_ZINITIX_BT532)	+= zinitix_bt532_ts.o
 obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE)	+= synaptics_jsglte/
+obj-$(CONFIG_STATE_NOTIFIER)		+= state_notifier.o
diff --git a/drivers/input/touchscreen/state_notifier.c b/drivers/input/touchscreen/state_notifier.c
new file mode 100644
index 000000000..7c67b9fc6
--- /dev/null
+++ b/drivers/input/touchscreen/state_notifier.c
@@ -0,0 +1,135 @@
+/*
+ * State Notifier Driver
+ *
+ * Copyright (c) 2013-2015, Pranav Vashi <neobuddy89@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/state_notifier.h>
+#include <linux/notifier.h>
+#include <linux/export.h>
+#include <linux/fb.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+
+#define DEFAULT_SUSPEND_DEFER_TIME	10
+#define STATE_NOTIFIER			"state_notifier"
+
+/*
+ * debug = 1 will print all
+ */
+static unsigned int debug = 1;
+module_param_named(debug_mask, debug, uint, 0644);
+
+static bool state_suspended;
+module_param_named(state_suspended, state_suspended, bool, 0444);
+
+#define dprintk(msg...)		\
+do {				\
+	if (debug)		\
+		pr_info(msg);	\
+} while (0)
+
+static unsigned int suspend_defer_time = DEFAULT_SUSPEND_DEFER_TIME;
+module_param_named(suspend_defer_time, suspend_defer_time, uint, 0664);
+static struct delayed_work suspend_work;
+static struct workqueue_struct *susp_wq;
+struct work_struct resume_work;
+static bool suspend_in_progress;
+
+static BLOCKING_NOTIFIER_HEAD(state_notifier_list);
+
+/**
+ *	state_register_client - register a client notifier
+ *	@nb: notifier block to callback on events
+ */
+int state_register_client(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&state_notifier_list, nb);
+}
+EXPORT_SYMBOL(state_register_client);
+
+/**
+ *	state_unregister_client - unregister a client notifier
+ *	@nb: notifier block to callback on events
+ */
+int state_unregister_client(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&state_notifier_list, nb);
+}
+EXPORT_SYMBOL(state_unregister_client);
+
+/**
+ *	state_notifier_call_chain - notify clients on state_events
+ *	@val: Value passed unmodified to notifier function
+ *	@v: pointer passed unmodified to notifier function
+ *
+ */
+int state_notifier_call_chain(unsigned long val, void *v)
+{
+	return blocking_notifier_call_chain(&state_notifier_list, val, v);
+}
+EXPORT_SYMBOL_GPL(state_notifier_call_chain);
+
+static void _suspend_work(struct work_struct *work)
+{
+	state_notifier_call_chain(STATE_NOTIFIER_SUSPEND, NULL);
+	state_suspended = true;
+	suspend_in_progress = false;
+	dprintk("%s: suspend completed.\n", STATE_NOTIFIER);
+}
+
+static void _resume_work(struct work_struct *work)
+{
+	state_notifier_call_chain(STATE_NOTIFIER_ACTIVE, NULL);
+	msleep_interruptible(50);
+	state_suspended = false;
+	dprintk("%s: resume completed.\n", STATE_NOTIFIER);
+}
+
+void state_suspend(void)
+{
+	if (state_suspended || suspend_in_progress)
+		return;
+
+	dprintk("%s: suspend called.\n", STATE_NOTIFIER);
+	suspend_in_progress = true;
+
+	queue_delayed_work_on(0, susp_wq, &suspend_work, 
+		msecs_to_jiffies(suspend_defer_time * 1000));
+}
+
+void state_resume(void)
+{
+	dprintk("%s: resume called.\n", STATE_NOTIFIER);
+	cancel_delayed_work_sync(&suspend_work);
+	suspend_in_progress = false;
+
+	if (state_suspended)
+		queue_work_on(0, susp_wq, &resume_work);
+}
+
+static int __init state_notifier_init(void)
+{
+	susp_wq = create_singlethread_workqueue("state_susp_wq");
+
+	if (!susp_wq) {
+		pr_err("State Notifier failed to allocate suspend workqueue\n");
+		return 0;
+	}
+
+	INIT_DELAYED_WORK(&suspend_work, _suspend_work);
+	INIT_WORK(&resume_work, _resume_work);
+
+	return 0;
+}
+
+subsys_initcall(state_notifier_init);
+
+MODULE_AUTHOR("Pranav Vashi <neobuddy89@gmail.com>");
+MODULE_DESCRIPTION("State Notifier Driver");
+MODULE_LICENSE("GPLv2");
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index a20d64d4e..4fd4f519c 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1931,8 +1931,8 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
 static void clear_dte_entry(u16 devid)
 {
 	/* remove entry from the device table seen by the hardware */
-	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
-	amd_iommu_dev_table[devid].data[1] = 0;
+	amd_iommu_dev_table[devid].data[0]  = IOMMU_PTE_P | IOMMU_PTE_TV;
+	amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK;
 
 	amd_iommu_apply_erratum_63(devid);
 }
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index c4ffacb03..42f2090d3 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -277,6 +277,7 @@
 #define IOMMU_PTE_IR (1ULL << 61)
 #define IOMMU_PTE_IW (1ULL << 62)
 
+#define DTE_FLAG_MASK	(0x3ffULL << 32)
 #define DTE_FLAG_IOTLB	(0x01UL << 32)
 #define DTE_FLAG_GV	(0x01ULL << 55)
 #define DTE_GLX_SHIFT	(56)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 2e48b2273..f0a84bc3c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1827,13 +1827,20 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 				return -ENOMEM;
 			/* It is large page*/
 			if (largepage_lvl > 1) {
+				unsigned long nr_superpages, end_pfn, lvl_pages;
+
 				pteval |= DMA_PTE_LARGE_PAGE;
-				/* Ensure that old small page tables are removed to make room
-				   for superpage, if they exist. */
-				dma_pte_clear_range(domain, iov_pfn,
-						    iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
-				dma_pte_free_pagetable(domain, iov_pfn,
-						       iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
+				lvl_pages = lvl_to_nr_pages(largepage_lvl);
+
+				nr_superpages = sg_res / lvl_pages;
+				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
+
+				/*
+				 * Ensure that old small page tables are
+				 * removed to make room for superpage(s).
+				 */
+				dma_pte_clear_range(domain, iov_pfn, end_pfn);
+				dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
 			} else {
 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
 			}
diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c
index ce8897933..004fa1089 100644
--- a/drivers/macintosh/windfarm_core.c
+++ b/drivers/macintosh/windfarm_core.c
@@ -421,7 +421,7 @@ int wf_unregister_client(struct notifier_block *nb)
 {
 	mutex_lock(&wf_lock);
 	blocking_notifier_chain_unregister(&wf_client_list, nb);
-	wf_client_count++;
+	wf_client_count--;
 	if (wf_client_count == 0)
 		wf_stop_thread();
 	mutex_unlock(&wf_lock);
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2241534d3..f3b617eb3 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -347,7 +347,7 @@ config DM_MULTIPATH
 	# of SCSI_DH if the latter isn't defined but if
 	# it is, DM_MULTIPATH must depend on it.  We get a build
 	# error if SCSI_DH=m and DM_MULTIPATH=y
-	depends on SCSI_DH || !SCSI_DH
+	depends on !SCSI_DH || SCSI
 	---help---
 	  Allow volume managers to support multipath hardware.
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 14c3eea6e..29fb41513 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7929,6 +7929,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
 		md_wakeup_thread(rdev->mddev->thread);
 	}
 	return rv;
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index dddd5a47f..be86d59ea 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -502,7 +502,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
 
 	r = new_block(s->info, &right);
 	if (r < 0) {
-		/* FIXME: put left */
+		unlock_block(s->info, left);
 		return r;
 	}
 
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index de63a1fc3..a0bb34a81 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	char b[BDEVNAME_SIZE];
 	char b2[BDEVNAME_SIZE];
 	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+	unsigned short blksize = 512;
 
 	if (!conf)
 		return -ENOMEM;
@@ -102,6 +103,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		sector_div(sectors, mddev->chunk_sectors);
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
+		blksize = max(blksize, queue_logical_block_size(
+				      rdev1->bdev->bd_disk->queue));
+
 		rdev_for_each(rdev2, mddev) {
 			pr_debug("md/raid0:%s:   comparing %s(%llu)"
 				 " with %s(%llu)\n",
@@ -138,6 +142,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	}
 	pr_debug("md/raid0:%s: FINAL %d zones\n",
 		 mdname(mddev), conf->nr_strip_zones);
+	/*
+	 * now since we have the hard sector sizes, we can make sure
+	 * chunk size is a multiple of that sector size
+	 */
+	if ((mddev->chunk_sectors << 9) % blksize) {
+		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+		       mdname(mddev),
+		       mddev->chunk_sectors << 9, blksize);
+		err = -EINVAL;
+		goto abort;
+	}
+
 	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
@@ -186,9 +202,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 		dev[j] = rdev1;
 
-		disk_stack_limits(mddev->gendisk, rdev1->bdev,
-				  rdev1->data_offset << 9);
-
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
 			conf->has_merge_bvec = 1;
 
@@ -257,21 +270,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
-	/*
-	 * now since we have the hard sector sizes, we can make sure
-	 * chunk size is a multiple of that sector size
-	 */
-	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
-		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
-		       mdname(mddev),
-		       mddev->chunk_sectors << 9);
-		goto abort;
-	}
-
-	blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-	blk_queue_io_opt(mddev->queue,
-			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
 	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
@@ -431,6 +429,27 @@ static int raid0_run(struct mddev *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
+	if (mddev->queue) {
+		struct md_rdev *rdev;
+		bool discard_supported = false;
+
+		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		blk_queue_io_opt(mddev->queue,
+				 (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+		rdev_for_each(rdev, mddev) {
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+				discard_supported = true;
+		}
+		if (!discard_supported)
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		else
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a5e678ee7..375cdc68d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1285,6 +1285,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 		set_bit(Faulty, &rdev->flags);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid1:%s: Operation continuing on %d devices.\n",
@@ -2055,6 +2056,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
+	bool fail = false;
 	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2067,6 +2069,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			 * narrow down and record precise write
 			 * errors.
 			 */
+			fail = true;
 			if (!narrow_write_error(r1_bio, m)) {
 				md_error(conf->mddev,
 					 conf->mirrors[m].rdev);
@@ -2076,9 +2079,17 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			rdev_dec_pending(conf->mirrors[m].rdev,
 					 conf->mddev);
 		}
-	if (test_bit(R1BIO_WriteError, &r1_bio->state))
-		close_write(r1_bio);
-	raid_end_bio_io(r1_bio);
+	if (fail) {
+		spin_lock_irq(&conf->device_lock);
+		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+		conf->nr_queued++;
+		spin_unlock_irq(&conf->device_lock);
+		md_wakeup_thread(conf->mddev->thread);
+	} else {
+		if (test_bit(R1BIO_WriteError, &r1_bio->state))
+			close_write(r1_bio);
+		raid_end_bio_io(r1_bio);
+	}
 }
 
 static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2181,6 +2192,29 @@ static void raid1d(struct mddev *mddev)
 
 	md_check_recovery(mddev);
 
+	if (!list_empty_careful(&conf->bio_end_io_list) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		LIST_HEAD(tmp);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			while (!list_empty(&conf->bio_end_io_list)) {
+				list_move(conf->bio_end_io_list.prev, &tmp);
+				conf->nr_queued--;
+			}
+		}
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		while (!list_empty(&tmp)) {
+			r1_bio = list_first_entry(&conf->bio_end_io_list,
+						  struct r1bio, retry_list);
+			list_del(&r1_bio->retry_list);
+			if (mddev->degraded)
+				set_bit(R1BIO_Degraded, &r1_bio->state);
+			if (test_bit(R1BIO_WriteError, &r1_bio->state))
+				close_write(r1_bio);
+			raid_end_bio_io(r1_bio);
+		}
+	}
+
 	blk_start_plug(&plug);
 	for (;;) {
 
@@ -2582,6 +2616,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
 	INIT_LIST_HEAD(&conf->retry_list);
+	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded1393..50086cf0e 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -48,6 +48,11 @@ struct r1conf {
 	 * block, or anything else.
 	 */
 	struct list_head	retry_list;
+	/* A separate list of r1bio which just need raid_end_bio_io called.
+	 * This mustn't happen for writes which had any errors if the superblock
+	 * needs to be written.
+	 */
+	struct list_head	bio_end_io_list;
 
 	/* queue pending writes to be submitted on unplug */
 	struct bio_list		pending_bio_list;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f9a981b0e..6ce5c8d3b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1453,6 +1453,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
@@ -2526,6 +2527,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 		}
 		put_buf(r10_bio);
 	} else {
+		bool fail = false;
 		for (m = 0; m < conf->copies; m++) {
 			int dev = r10_bio->devs[m].devnum;
 			struct bio *bio = r10_bio->devs[m].bio;
@@ -2538,6 +2540,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			} else if (bio != NULL &&
 				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+				fail = true;
 				if (!narrow_write_error(r10_bio, m)) {
 					md_error(conf->mddev, rdev);
 					set_bit(R10BIO_Degraded,
@@ -2555,10 +2558,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		if (test_bit(R10BIO_WriteError,
-			     &r10_bio->state))
-			close_write(r10_bio);
-		raid_end_bio_io(r10_bio);
+		if (fail) {
+			spin_lock_irq(&conf->device_lock);
+			list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+			spin_unlock_irq(&conf->device_lock);
+			md_wakeup_thread(conf->mddev->thread);
+		} else {
+			if (test_bit(R10BIO_WriteError,
+				     &r10_bio->state))
+				close_write(r10_bio);
+			raid_end_bio_io(r10_bio);
+		}
 	}
 }
 
@@ -2572,6 +2582,29 @@ static void raid10d(struct mddev *mddev)
 
 	md_check_recovery(mddev);
 
+	if (!list_empty_careful(&conf->bio_end_io_list) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		LIST_HEAD(tmp);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			list_add(&tmp, &conf->bio_end_io_list);
+			list_del_init(&conf->bio_end_io_list);
+		}
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		while (!list_empty(&tmp)) {
+			r10_bio = list_first_entry(&conf->bio_end_io_list,
+						  struct r10bio, retry_list);
+			list_del(&r10_bio->retry_list);
+			if (mddev->degraded)
+				set_bit(R10BIO_Degraded, &r10_bio->state);
+
+			if (test_bit(R10BIO_WriteError,
+				     &r10_bio->state))
+				close_write(r10_bio);
+			raid_end_bio_io(r10_bio);
+		}
+	}
+
 	blk_start_plug(&plug);
 	for (;;) {
 
@@ -3270,6 +3303,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
+	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 7c615613c..c09b38414 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -42,6 +42,12 @@ struct r10conf {
 	sector_t		chunk_mask;
 
 	struct list_head	retry_list;
+	/* A separate list of r1bio which just need raid_end_bio_io called.
+	 * This mustn't happen for writes which had any errors if the superblock
+	 * needs to be written.
+	 */
+	struct list_head	bio_end_io_list;
+
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
 	int			pending_count;
diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c
index cec1f8c05..a7ff6b547 100644
--- a/drivers/media/rc/rc-main.c
+++ b/drivers/media/rc/rc-main.c
@@ -946,9 +946,6 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env)
 {
 	struct rc_dev *dev = to_rc_dev(device);
 
-	if (!dev || !dev->input_dev)
-		return -ENODEV;
-
 	if (dev->rc_map.name)
 		ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name);
 	if (dev->driver_name)
diff --git a/drivers/misc/modem_if/sipc4_modem.c b/drivers/misc/modem_if/sipc4_modem.c
index 66ae4c1af..2ba3bac89 100755
--- a/drivers/misc/modem_if/sipc4_modem.c
+++ b/drivers/misc/modem_if/sipc4_modem.c
@@ -105,8 +105,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t,
 		return NULL;
 	}
 
-	rb_init_node(&iod->node_chan);
-	rb_init_node(&iod->node_fmt);
+	RB_CLEAR_NODE(&iod->node_chan);
+	RB_CLEAR_NODE(&iod->node_fmt);
 
 	iod->name = io_t->name;
 	iod->id = io_t->id;
diff --git a/drivers/misc/modem_if/sipc5_modem.c b/drivers/misc/modem_if/sipc5_modem.c
index 8943686fa..84043a2a7 100755
--- a/drivers/misc/modem_if/sipc5_modem.c
+++ b/drivers/misc/modem_if/sipc5_modem.c
@@ -108,8 +108,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t,
 		return NULL;
 	}
 
-	rb_init_node(&iod->node_chan);
-	rb_init_node(&iod->node_fmt);
+	RB_CLEAR_NODE(&iod->node_chan);
+	RB_CLEAR_NODE(&iod->node_fmt);
 
 	iod->name = io_t->name;
 	iod->id = io_t->id;
diff --git a/drivers/misc/sec_misc.c b/drivers/misc/sec_misc.c
index dbd80b854..5a6a17eb7 100755
--- a/drivers/misc/sec_misc.c
+++ b/drivers/misc/sec_misc.c
@@ -283,7 +283,7 @@ static ssize_t drop_caches_store
 		si_meminfo(&i);
 		printk("[Before]\nMemFree : %8lu kB\n", K(i.freeram));
 		printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \
-						total_swapcache_pages - i.bufferram));
+						total_swapcache_pages() - i.bufferram));
 
 		iterate_supers(drop_pagecache_sb, NULL);
 		drop_slab();
@@ -291,7 +291,7 @@ static ssize_t drop_caches_store
 		si_meminfo(&i);
 		printk("[After]\nMemFree : %8lu kB\n", K(i.freeram));
 		printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \
-						total_swapcache_pages - i.bufferram));
+						total_swapcache_pages() - i.bufferram));
 		printk("Cached Drop done!\n");
 	}
 out:
diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c
index 43f1a0011..8f793ea1d 100644
--- a/drivers/mtd/ubi/io.c
+++ b/drivers/mtd/ubi/io.c
@@ -942,6 +942,11 @@ static int validate_vid_hdr(const struct ubi_device *ubi,
 		goto bad;
 	}
 
+	if (data_size > ubi->leb_size) {
+		dbg_err("bad data_size");
+		goto bad;
+	}
+
 	if (vol_type == UBI_VID_STATIC) {
 		/*
 		 * Although from high-level point of view static volumes may
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
index c015fc0a7..4105a508f 100644
--- a/drivers/mtd/ubi/vtbl.c
+++ b/drivers/mtd/ubi/vtbl.c
@@ -656,6 +656,7 @@ static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si,
 		if (ubi->corr_peb_count)
 			ubi_err("%d PEBs are corrupted and not used",
 				ubi->corr_peb_count);
+		return -ENOSPC;
 	}
 	ubi->rsvd_pebs += reserved_pebs;
 	ubi->avail_pebs -= reserved_pebs;
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 424260a3c..43422b94d 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1516,6 +1516,7 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
 		if (ubi->corr_peb_count)
 			ubi_err("%d PEBs are corrupted and not used",
 				ubi->corr_peb_count);
+		err = -ENOSPC;
 		goto out_free;
 	}
 	ubi->avail_pebs -= WL_RESERVED_PEBS;
diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c
index cac5b256a..37534a06b 100644
--- a/drivers/net/wireless/ath/ath9k/init.c
+++ b/drivers/net/wireless/ath/ath9k/init.c
@@ -683,6 +683,7 @@ void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw)
 	hw->max_rate_tries = 10;
 	hw->sta_data_size = sizeof(struct ath_node);
 	hw->vif_data_size = sizeof(struct ath_vif);
+	hw->extra_tx_headroom = 4;
 
 	hw->wiphy->available_antennas_rx = BIT(ah->caps.max_rxchains) - 1;
 	hw->wiphy->available_antennas_tx = BIT(ah->caps.max_txchains) - 1;
diff --git a/drivers/net/wireless/bcmdhd/wl_cfg80211.c b/drivers/net/wireless/bcmdhd/wl_cfg80211.c
index 671c85a42..24081ea33 100644
--- a/drivers/net/wireless/bcmdhd/wl_cfg80211.c
+++ b/drivers/net/wireless/bcmdhd/wl_cfg80211.c
@@ -10084,7 +10084,7 @@ wl_cfg80211_netdev_notifier_call(struct notifier_block * nb,
 					break;
 				}
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(100);
+				schedule_timeout(HZ);
 				set_current_state(TASK_RUNNING);
 				refcnt++;
 			}
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
index 56f41c940..6314e24c2 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c
@@ -1063,7 +1063,7 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw,
 			u8 *pn = seq.ccmp.pn;
 
 			ieee80211_get_key_rx_seq(key, i, &seq);
-			aes_sc->pn = cpu_to_le64(
+			aes_sc[i].pn = cpu_to_le64(
 					(u64)pn[5] |
 					((u64)pn[4] << 8) |
 					((u64)pn[3] << 16) |
diff --git a/drivers/of/address.c b/drivers/of/address.c
index 693cdaaf2..05c653958 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -623,10 +623,10 @@ struct device_node *of_find_matching_node_by_address(struct device_node *from,
 	struct resource res;
 
 	while (dn) {
-		if (of_address_to_resource(dn, 0, &res))
-			continue;
-		if (res.start == base_address)
+		if (!of_address_to_resource(dn, 0, &res) &&
+		    res.start == base_address)
 			return dn;
+
 		dn = of_find_matching_node(dn, matches);
 	}
 
diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 2a581642c..f49d961cf 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -357,6 +357,56 @@ static const struct pci_vpd_ops pci_vpd_pci22_ops = {
 	.release = pci_vpd_pci22_release,
 };
 
+static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count,
+			       void *arg)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	ssize_t ret;
+
+	if (!tdev)
+		return -ENODEV;
+
+	ret = pci_read_vpd(tdev, pos, count, arg);
+	pci_dev_put(tdev);
+	return ret;
+}
+
+static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count,
+				const void *arg)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	ssize_t ret;
+
+	if (!tdev)
+		return -ENODEV;
+
+	ret = pci_write_vpd(tdev, pos, count, arg);
+	pci_dev_put(tdev);
+	return ret;
+}
+
+static const struct pci_vpd_ops pci_vpd_f0_ops = {
+	.read = pci_vpd_f0_read,
+	.write = pci_vpd_f0_write,
+	.release = pci_vpd_pci22_release,
+};
+
+static int pci_vpd_f0_dev_check(struct pci_dev *dev)
+{
+	struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn));
+	int ret = 0;
+
+	if (!tdev)
+		return -ENODEV;
+	if (!tdev->vpd || !tdev->multifunction ||
+	    dev->class != tdev->class || dev->vendor != tdev->vendor ||
+	    dev->device != tdev->device)
+		ret = -ENODEV;
+
+	pci_dev_put(tdev);
+	return ret;
+}
+
 int pci_vpd_pci22_init(struct pci_dev *dev)
 {
 	struct pci_vpd_pci22 *vpd;
@@ -365,12 +415,21 @@ int pci_vpd_pci22_init(struct pci_dev *dev)
 	cap = pci_find_capability(dev, PCI_CAP_ID_VPD);
 	if (!cap)
 		return -ENODEV;
+	if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) {
+		int ret = pci_vpd_f0_dev_check(dev);
+
+		if (ret)
+			return ret;
+	}
 	vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC);
 	if (!vpd)
 		return -ENOMEM;
 
 	vpd->base.len = PCI_VPD_PCI22_SIZE;
-	vpd->base.ops = &pci_vpd_pci22_ops;
+	if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0)
+		vpd->base.ops = &pci_vpd_f0_ops;
+	else
+		vpd->base.ops = &pci_vpd_pci22_ops;
 	mutex_init(&vpd->lock);
 	vpd->cap = cap;
 	vpd->busy = false;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index c0300242d..3ce87c82f 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1883,6 +1883,15 @@ static void __devinit quirk_netmos(struct pci_dev *dev)
 DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_NETMOS, PCI_ANY_ID,
 			 PCI_CLASS_COMMUNICATION_SERIAL, 8, quirk_netmos);
 
+static void quirk_f0_vpd_link(struct pci_dev *dev)
+{
+	if ((dev->class >> 8) != PCI_CLASS_NETWORK_ETHERNET ||
+	    !dev->multifunction || !PCI_FUNC(dev->devfn))
+		return;
+	dev->dev_flags |= PCI_DEV_FLAGS_VPD_REF_F0;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_f0_vpd_link);
+
 static void __devinit quirk_e100_interrupt(struct pci_dev *dev)
 {
 	u16 command, pmcsr;
@@ -2834,12 +2843,15 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x3c28, vtd_mask_spec_errors);
 
 static void __devinit fixup_ti816x_class(struct pci_dev* dev)
 {
+	u32 class = dev->class;
+
 	/* TI 816x devices do not have class code set when in PCIe boot mode */
-	dev_info(&dev->dev, "Setting PCI class for 816x PCIe device\n");
-	dev->class = PCI_CLASS_MULTIMEDIA_VIDEO;
+	dev->class = PCI_CLASS_MULTIMEDIA_VIDEO << 8;
+	dev_info(&dev->dev, "PCI class overridden (%#08x -> %#08x)\n",
+		 class, dev->class);
 }
 DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_TI, 0xb800,
-				 PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class);
+			      PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class);
 
 /* Some PCIe devices do not work reliably with the claimed maximum
  * payload size supported.
diff --git a/drivers/platform/msm/usb_bam.c b/drivers/platform/msm/usb_bam.c
index bdb7e4122..913dc0bae 100644
--- a/drivers/platform/msm/usb_bam.c
+++ b/drivers/platform/msm/usb_bam.c
@@ -31,7 +31,7 @@
 
 #define USB_THRESHOLD 512
 #define USB_BAM_MAX_STR_LEN 50
-#define USB_BAM_TIMEOUT (10*HZ)
+#define USB_BAM_TIMEOUT 10000
 
 enum usb_bam_sm {
 	USB_BAM_SM_INIT = 0,
@@ -1073,7 +1073,7 @@ static void wait_for_prod_granted(enum usb_bam cur_bam)
 	} else if (ret == -EINPROGRESS) {
 		pr_debug("%s: Waiting for PROD_GRANTED\n", __func__);
 		if (!wait_for_completion_timeout(&info.prod_avail[cur_bam],
-			USB_BAM_TIMEOUT))
+			msecs_to_jiffies(USB_BAM_TIMEOUT)))
 			pr_err("%s: Timeout wainting for PROD_GRANTED\n",
 				__func__);
 	} else
@@ -1117,7 +1117,7 @@ static void wait_for_prod_release(enum usb_bam cur_bam)
 	} else if (ret == -EINPROGRESS) {
 		pr_debug("%s: Waiting for PROD_RELEASED\n", __func__);
 		if (!wait_for_completion_timeout(&info.prod_released[cur_bam],
-						USB_BAM_TIMEOUT))
+						msecs_to_jiffies(USB_BAM_TIMEOUT)))
 			pr_err("%s: Timeout waiting for PROD_RELEASED\n",
 			__func__);
 	} else
@@ -2073,7 +2073,7 @@ void usb_bam_reset_complete(void)
 {
 	pr_debug("Waiting for reset compelte");
 	if (wait_for_completion_interruptible_timeout(&ctx.reset_done,
-			10*HZ) <= 0)
+			msecs_to_jiffies(10000)) <= 0)
 		pr_warn("Timeout while waiting for reset");
 
 	pr_debug("Finished Waiting for reset complete");
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index a4f1a503b..0303e0256 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -2592,7 +2592,7 @@ static void regulator_bulk_enable_async(void *data, async_cookie_t cookie)
 int regulator_bulk_enable(int num_consumers,
 			  struct regulator_bulk_data *consumers)
 {
-	LIST_HEAD(async_domain);
+	ASYNC_DOMAIN_EXCLUSIVE(async_domain);
 	int i;
 	int ret = 0;
 
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 51ee663c1..ec96c5fe3 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -720,7 +720,7 @@ static void async_sas_ata_eh(void *data, async_cookie_t cookie)
 void sas_ata_strategy_handler(struct Scsi_Host *shost)
 {
 	struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost);
-	LIST_HEAD(async);
+	ASYNC_DOMAIN_EXCLUSIVE(async);
 	int i;
 
 	/* it's ok to defer revalidation events during ata eh, these
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c
index dbb8edfc8..06da698da 100644
--- a/drivers/scsi/mvsas/mv_sas.c
+++ b/drivers/scsi/mvsas/mv_sas.c
@@ -984,6 +984,8 @@ static void mvs_slot_free(struct mvs_info *mvi, u32 rx_desc)
 static void mvs_slot_task_free(struct mvs_info *mvi, struct sas_task *task,
 			  struct mvs_slot_info *slot, u32 slot_idx)
 {
+	if (!slot)
+		return;
 	if (!slot->task)
 		return;
 	if (!sas_protocol_ata(task->task_proto))
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 29fabffd6..b1f76da1e 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -54,6 +54,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/async.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
diff --git a/drivers/sensorhub/stm32f/ssp.h b/drivers/sensorhub/stm32f/ssp.h
index 414182817..81c24adf0 100755
--- a/drivers/sensorhub/stm32f/ssp.h
+++ b/drivers/sensorhub/stm32f/ssp.h
@@ -48,7 +48,7 @@
 #include "ssp_sensorhub.h"
 #endif
 
-#define SSP_DBG		1
+#define SSP_DBG		0
 
 #define SUCCESS		1
 #define FAIL		0
@@ -56,18 +56,19 @@
 
 #define FACTORY_DATA_MAX	100
 
-#if SSP_DBG
-#define SSP_FUNC_DBG 1
-#define SSP_DATA_DBG 0
-
 /* ssp mcu device ID */
 #define DEVICE_ID			0x55
 
+#if SSP_DBG
+#define SSP_FUNC_DBG 1
+#define SSP_DATA_DBG 0
 
 #define ssp_dbg(dev, format, ...) do { \
 	printk(KERN_INFO dev, format, ##__VA_ARGS__); \
 	} while (0)
 #else
+#define SSP_FUNC_DBG 0
+#define SSP_DATA_DBG 0
 #define ssp_dbg(dev, format, ...)
 #endif
 
diff --git a/drivers/sensorhub/stm32f/ssp_debug.c b/drivers/sensorhub/stm32f/ssp_debug.c
index af356f4ca..3d25335b4 100755
--- a/drivers/sensorhub/stm32f/ssp_debug.c
+++ b/drivers/sensorhub/stm32f/ssp_debug.c
@@ -196,7 +196,9 @@ int print_mcu_debug(char *pchRcvDataFrame, int *pDataIdx,
 		int iRcvDataFrameLength)
 {
 	int iLength = pchRcvDataFrame[(*pDataIdx)++];
+#if SSP_DBG
 	int cur = *pDataIdx;
+#endif
 
 	if (iLength > iRcvDataFrameLength - *pDataIdx || iLength <= 0) {
 		ssp_dbg("[SSP]: MSG From MCU - invalid debug length(%d/%d/%d)\n",
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index dc25bee8d..2ecc2d646 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -799,6 +799,10 @@ static irqreturn_t ssp_int(int irq, void *dev_id)
 	if (!(sccr1_reg & SSCR1_TIE))
 		mask &= ~SSSR_TFS;
 
+	/* Ignore RX timeout interrupt if it is disabled */
+	if (!(sccr1_reg & SSCR1_TINTE))
+		mask &= ~SSSR_TINT;
+
 	if (!(status & mask))
 		return IRQ_NONE;
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 3d8f662e4..a3f31e9ab 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -831,8 +831,7 @@ static struct class spi_master_class = {
  *
  * The caller is responsible for assigning the bus number and initializing
  * the master's methods before calling spi_register_master(); and (after errors
- * adding the device) calling spi_master_put() and kfree() to prevent a memory
- * leak.
+ * adding the device) calling spi_master_put() to prevent a memory leak.
  */
 struct spi_master *spi_alloc_master(struct device *dev, unsigned size)
 {
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index d6931e9cd..c15649439 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -80,12 +80,6 @@ source "drivers/staging/sep/Kconfig"
 
 source "drivers/staging/iio/Kconfig"
 
-source "drivers/staging/zram/Kconfig"
-
-source "drivers/staging/zcache/Kconfig"
-
-source "drivers/staging/zsmalloc/Kconfig"
-
 source "drivers/staging/wlags49_h2/Kconfig"
 
 source "drivers/staging/wlags49_h25/Kconfig"
@@ -126,8 +120,6 @@ source "drivers/staging/android/Kconfig"
 
 source "drivers/staging/telephony/Kconfig"
 
-source "drivers/staging/ramster/Kconfig"
-
 source "drivers/staging/ozwpan/Kconfig"
 
 source "drivers/staging/vnswap/Kconfig"
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 8229e82e9..605826f30 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -32,9 +32,6 @@ obj-$(CONFIG_VT6656)		+= vt6656/
 obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_DX_SEP)            += sep/
 obj-$(CONFIG_IIO)		+= iio/
-obj-$(CONFIG_ZRAM)		+= zram/
-obj-$(CONFIG_ZCACHE)		+= zcache/
-obj-$(CONFIG_ZSMALLOC)		+= zsmalloc/
 obj-$(CONFIG_WLAGS49_H2)	+= wlags49_h2/
 obj-$(CONFIG_WLAGS49_H25)	+= wlags49_h25/
 obj-$(CONFIG_FB_SM7XX)		+= sm7xx/
@@ -54,6 +51,5 @@ obj-$(CONFIG_MFD_NVEC)		+= nvec/
 obj-$(CONFIG_DRM_OMAP)		+= omapdrm/
 obj-$(CONFIG_ANDROID)		+= android/
 obj-$(CONFIG_PHONE)		+= telephony/
-obj-$(CONFIG_RAMSTER)		+= ramster/
 obj-$(CONFIG_USB_WPAN_HCD)	+= ozwpan/
 obj-$(CONFIG_VNSWAP)		+= vnswap/
diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c
index 159cbf8a7..bf2158b6c 100644
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -41,6 +41,7 @@
 #include <linux/delay.h>
 #include <linux/swap.h>
 #include <linux/fs.h>
+#include <linux/zcache.h>
 
 #include <linux/ratelimit.h>
 
@@ -144,7 +145,7 @@ static DEFINE_MUTEX(scan_mutex);
 #endif
 
 #if defined(CONFIG_ZSWAP)
-extern atomic_t zswap_pool_pages;
+extern u64 zswap_pool_total_size;
 extern atomic_t zswap_stored_pages;
 #endif
 
@@ -174,6 +175,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 	bool is_active_high;
 	bool flag = 0;
 #endif
+	u32 zswap_pool_total_pages;
 
 	if (nr_to_scan > 0) {
 		if (mutex_lock_interruptible(&scan_mutex) < 0)
@@ -196,7 +198,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 	is_active_high = (global_page_state(NR_ACTIVE_FILE) >
 				global_page_state(NR_INACTIVE_FILE)) ? 1 : 0;
 #endif
-	other_file = global_page_state(NR_FILE_PAGES);
+	other_file = global_page_state(NR_FILE_PAGES) + zcache_pages();
 
 #if defined(CONFIG_CMA_PAGE_COUNTING) && defined(CONFIG_EXCLUDE_LRU_LIVING_IN_CMA)
 	if (get_nr_swap_pages() < SSWAP_LMK_THRESHOLD && cma_page_ratio >= CMA_PAGE_RATIO
@@ -205,8 +207,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 		flag = 1;
 	}
 #endif
-	if (global_page_state(NR_SHMEM) + total_swapcache_pages < other_file)
-		other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages;
+	if (global_page_state(NR_SHMEM) + total_swapcache_pages() < other_file)
+		other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages();
 	else
 		other_file = 0;
 
@@ -275,7 +277,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 #if defined(CONFIG_ZSWAP)
 		if (atomic_read(&zswap_stored_pages)) {
 			lowmem_print(3, "shown tasksize : %d\n", tasksize);
-			tasksize += atomic_read(&zswap_pool_pages) * get_mm_counter(p->mm, MM_SWAPENTS)
+			zswap_pool_total_pages = DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+			tasksize +=  zswap_pool_total_pages * get_mm_counter(p->mm, MM_SWAPENTS)
 				/ atomic_read(&zswap_stored_pages);
 			lowmem_print(3, "real tasksize : %d\n", tasksize);
 		}
@@ -301,22 +304,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 #if defined(CONFIG_CMA_PAGE_COUNTING)
 		lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, "
 			"ofree %d, ofile %d(%c), is_kswapd %d - "
-			"cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n",
+			"cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n,Total zcache is %ldkB\n",
 			selected->pid, selected->comm,
 			selected_oom_score_adj, selected_tasksize,
 			other_free, other_file, flag ? '-' : '+',
 			!!current_is_kswapd(),
 			nr_cma_free, sc->priority,
-			nr_cma_inactive_file, nr_cma_active_file);
+			nr_cma_inactive_file, nr_cma_active_file, (long)zcache_pages() * (long)(PAGE_SIZE / 1024));
 #else
 		lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, "
 				"free memory = %d, reclaimable memory = %d "
-				"is_kswapd %d cma_free %lu priority %d\n",
+				"is_kswapd %d cma_free %lu priority %d\nTotal zcache is %ldkB\n",
 				selected->pid, selected->comm,
 				selected_oom_score_adj, selected_tasksize,
 				other_free, other_file,
 				!!current_is_kswapd(),
-				nr_cma_free, sc->priority);
+				nr_cma_free, sc->priority,(long)zcache_pages() * (long)(PAGE_SIZE / 1024));
 #endif
 		lowmem_deathpending_timeout = jiffies + HZ;
 		send_sig(SIGKILL, selected, 0);
@@ -399,9 +402,9 @@ static int android_oom_handler(struct notifier_block *nb,
 
 	nr_cma_inactive_file = global_page_state(NR_CMA_INACTIVE_FILE);
 	nr_cma_active_file = global_page_state(NR_CMA_ACTIVE_FILE);
-	other_file = global_page_state(NR_FILE_PAGES) -
+	other_file = global_page_state(NR_FILE_PAGES) + zcache_pages() -
 					global_page_state(NR_SHMEM) -
-					total_swapcache_pages -
+					total_swapcache_pages() -
 					nr_cma_inactive_file -
 					nr_cma_active_file;
 #endif
diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig
deleted file mode 100644
index 4af1f8d4b..000000000
--- a/drivers/staging/ramster/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-config RAMSTER
-	bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem"
-	depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS=y && !ZCACHE && !XVMALLOC && !HIGHMEM
-	select LZO_COMPRESS
-	select LZO_DECOMPRESS
-	default n
-	help
-	  RAMster allows RAM on other machines in a cluster to be utilized
-	  dynamically and symmetrically instead of swapping to a local swap
-	  disk, thus improving performance on memory-constrained workloads
-	  while minimizing total RAM across the cluster.  RAMster, like
-	  zcache, compresses swap pages into local RAM, but then remotifies
-	  the compressed pages to another node in the RAMster cluster.
diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile
deleted file mode 100644
index bcc13c87f..000000000
--- a/drivers/staging/ramster/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_RAMSTER)	+=	zcache-main.o tmem.o r2net.o xvmalloc.o cluster/
diff --git a/drivers/staging/ramster/TODO b/drivers/staging/ramster/TODO
deleted file mode 100644
index 46fcf0c58..000000000
--- a/drivers/staging/ramster/TODO
+++ /dev/null
@@ -1,13 +0,0 @@
-For this staging driver, RAMster duplicates code from drivers/staging/zcache
-then incorporates changes to the local copy of the code.  For V5, it also
-directly incorporates the soon-to-be-removed drivers/staging/zram/xvmalloc.[ch]
-as all testing has been done with xvmalloc rather than the new zsmalloc.
-Before RAMster can be promoted from staging, the zcache and RAMster drivers
-should be either merged or reorganized to separate out common code.
-
-Until V4, RAMster duplicated code from fs/ocfs2/cluster, but this made
-RAMster incompatible with ocfs2 running in the same kernel and included
-lots of code that could be removed.  As of V5, the ocfs2 code has been
-mined and made RAMster-specific, made to communicate with a userland
-ramster-tools package rather than ocfs2-tools, and can co-exist with ocfs2
-both in the same kernel and in userland on the same machine.
diff --git a/drivers/staging/ramster/cluster/Makefile b/drivers/staging/ramster/cluster/Makefile
deleted file mode 100755
index 9c6943652..000000000
--- a/drivers/staging/ramster/cluster/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_RAMSTER) += ramster_nodemanager.o
-
-ramster_nodemanager-objs := heartbeat.o masklog.o nodemanager.o tcp.o
diff --git a/drivers/staging/ramster/cluster/heartbeat.c b/drivers/staging/ramster/cluster/heartbeat.c
deleted file mode 100755
index 002094907..000000000
--- a/drivers/staging/ramster/cluster/heartbeat.c
+++ /dev/null
@@ -1,464 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2004, 2005, 2012 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/configfs.h>
-
-#include "heartbeat.h"
-#include "tcp.h"
-#include "nodemanager.h"
-
-#include "masklog.h"
-
-/*
- * The first heartbeat pass had one global thread that would serialize all hb
- * callback calls.  This global serializing sem should only be removed once
- * we've made sure that all callees can deal with being called concurrently
- * from multiple hb region threads.
- */
-static DECLARE_RWSEM(r2hb_callback_sem);
-
-/*
- * multiple hb threads are watching multiple regions.  A node is live
- * whenever any of the threads sees activity from the node in its region.
- */
-static DEFINE_SPINLOCK(r2hb_live_lock);
-static unsigned long r2hb_live_node_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)];
-
-static struct r2hb_callback {
-	struct list_head list;
-} r2hb_callbacks[R2HB_NUM_CB];
-
-enum r2hb_heartbeat_modes {
-	R2HB_HEARTBEAT_LOCAL		= 0,
-	R2HB_HEARTBEAT_GLOBAL,
-	R2HB_HEARTBEAT_NUM_MODES,
-};
-
-char *r2hb_heartbeat_mode_desc[R2HB_HEARTBEAT_NUM_MODES] = {
-		"local",	/* R2HB_HEARTBEAT_LOCAL */
-		"global",	/* R2HB_HEARTBEAT_GLOBAL */
-};
-
-unsigned int r2hb_dead_threshold = R2HB_DEFAULT_DEAD_THRESHOLD;
-unsigned int r2hb_heartbeat_mode = R2HB_HEARTBEAT_LOCAL;
-
-/* Only sets a new threshold if there are no active regions.
- *
- * No locking or otherwise interesting code is required for reading
- * r2hb_dead_threshold as it can't change once regions are active and
- * it's not interesting to anyone until then anyway. */
-static void r2hb_dead_threshold_set(unsigned int threshold)
-{
-	if (threshold > R2HB_MIN_DEAD_THRESHOLD) {
-		spin_lock(&r2hb_live_lock);
-		r2hb_dead_threshold = threshold;
-		spin_unlock(&r2hb_live_lock);
-	}
-}
-
-static int r2hb_global_hearbeat_mode_set(unsigned int hb_mode)
-{
-	int ret = -1;
-
-	if (hb_mode < R2HB_HEARTBEAT_NUM_MODES) {
-		spin_lock(&r2hb_live_lock);
-		r2hb_heartbeat_mode = hb_mode;
-		ret = 0;
-		spin_unlock(&r2hb_live_lock);
-	}
-
-	return ret;
-}
-
-void r2hb_exit(void)
-{
-}
-
-int r2hb_init(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(r2hb_callbacks); i++)
-		INIT_LIST_HEAD(&r2hb_callbacks[i].list);
-
-	memset(r2hb_live_node_bitmap, 0, sizeof(r2hb_live_node_bitmap));
-
-	return 0;
-}
-
-/* if we're already in a callback then we're already serialized by the sem */
-static void r2hb_fill_node_map_from_callback(unsigned long *map,
-					     unsigned bytes)
-{
-	BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long)));
-
-	memcpy(map, &r2hb_live_node_bitmap, bytes);
-}
-
-/*
- * get a map of all nodes that are heartbeating in any regions
- */
-void r2hb_fill_node_map(unsigned long *map, unsigned bytes)
-{
-	/* callers want to serialize this map and callbacks so that they
-	 * can trust that they don't miss nodes coming to the party */
-	down_read(&r2hb_callback_sem);
-	spin_lock(&r2hb_live_lock);
-	r2hb_fill_node_map_from_callback(map, bytes);
-	spin_unlock(&r2hb_live_lock);
-	up_read(&r2hb_callback_sem);
-}
-EXPORT_SYMBOL_GPL(r2hb_fill_node_map);
-
-/*
- * heartbeat configfs bits.  The heartbeat set is a default set under
- * the cluster set in nodemanager.c.
- */
-
-/* heartbeat set */
-
-struct r2hb_hb_group {
-	struct config_group hs_group;
-	/* some stuff? */
-};
-
-static struct r2hb_hb_group *to_r2hb_hb_group(struct config_group *group)
-{
-	return group ?
-		container_of(group, struct r2hb_hb_group, hs_group)
-		: NULL;
-}
-
-static struct config_item r2hb_config_item;
-
-static struct config_item *r2hb_hb_group_make_item(struct config_group *group,
-							  const char *name)
-{
-	int ret;
-
-	if (strlen(name) > R2HB_MAX_REGION_NAME_LEN) {
-		ret = -ENAMETOOLONG;
-		goto free;
-	}
-
-	config_item_put(&r2hb_config_item);
-
-	return &r2hb_config_item;
-free:
-	return ERR_PTR(ret);
-}
-
-static void r2hb_hb_group_drop_item(struct config_group *group,
-					   struct config_item *item)
-{
-	if (r2hb_global_heartbeat_active()) {
-		printk(KERN_NOTICE "ramster: Heartbeat %s "
-			"on region %s (%s)\n",
-			"stopped/aborted", config_item_name(item),
-			"no region");
-	}
-
-	config_item_put(item);
-}
-
-struct r2hb_hb_group_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct r2hb_hb_group *, char *);
-	ssize_t (*store)(struct r2hb_hb_group *, const char *, size_t);
-};
-
-static ssize_t r2hb_hb_group_show(struct config_item *item,
-					 struct configfs_attribute *attr,
-					 char *page)
-{
-	struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item));
-	struct r2hb_hb_group_attribute *r2hb_hb_group_attr =
-		container_of(attr, struct r2hb_hb_group_attribute, attr);
-	ssize_t ret = 0;
-
-	if (r2hb_hb_group_attr->show)
-		ret = r2hb_hb_group_attr->show(reg, page);
-	return ret;
-}
-
-static ssize_t r2hb_hb_group_store(struct config_item *item,
-					  struct configfs_attribute *attr,
-					  const char *page, size_t count)
-{
-	struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item));
-	struct r2hb_hb_group_attribute *r2hb_hb_group_attr =
-		container_of(attr, struct r2hb_hb_group_attribute, attr);
-	ssize_t ret = -EINVAL;
-
-	if (r2hb_hb_group_attr->store)
-		ret = r2hb_hb_group_attr->store(reg, page, count);
-	return ret;
-}
-
-static ssize_t r2hb_hb_group_threshold_show(struct r2hb_hb_group *group,
-						     char *page)
-{
-	return sprintf(page, "%u\n", r2hb_dead_threshold);
-}
-
-static ssize_t r2hb_hb_group_threshold_store(struct r2hb_hb_group *group,
-						    const char *page,
-						    size_t count)
-{
-	unsigned long tmp;
-	char *p = (char *)page;
-	int err;
-
-	err = kstrtoul(p, 10, &tmp);
-	if (err)
-		return err;
-
-	/* this will validate ranges for us. */
-	r2hb_dead_threshold_set((unsigned int) tmp);
-
-	return count;
-}
-
-static
-ssize_t r2hb_hb_group_mode_show(struct r2hb_hb_group *group,
-				       char *page)
-{
-	return sprintf(page, "%s\n",
-		       r2hb_heartbeat_mode_desc[r2hb_heartbeat_mode]);
-}
-
-static
-ssize_t r2hb_hb_group_mode_store(struct r2hb_hb_group *group,
-					const char *page, size_t count)
-{
-	unsigned int i;
-	int ret;
-	size_t len;
-
-	len = (page[count - 1] == '\n') ? count - 1 : count;
-	if (!len)
-		return -EINVAL;
-
-	for (i = 0; i < R2HB_HEARTBEAT_NUM_MODES; ++i) {
-		if (strnicmp(page, r2hb_heartbeat_mode_desc[i], len))
-			continue;
-
-		ret = r2hb_global_hearbeat_mode_set(i);
-		if (!ret)
-			printk(KERN_NOTICE "ramster: Heartbeat mode "
-						"set to %s\n",
-			       r2hb_heartbeat_mode_desc[i]);
-		return count;
-	}
-
-	return -EINVAL;
-
-}
-
-static struct r2hb_hb_group_attribute r2hb_hb_group_attr_threshold = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "dead_threshold",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2hb_hb_group_threshold_show,
-	.store	= r2hb_hb_group_threshold_store,
-};
-
-static struct r2hb_hb_group_attribute r2hb_hb_group_attr_mode = {
-	.attr   = { .ca_owner = THIS_MODULE,
-		.ca_name = "mode",
-		.ca_mode = S_IRUGO | S_IWUSR },
-	.show   = r2hb_hb_group_mode_show,
-	.store  = r2hb_hb_group_mode_store,
-};
-
-static struct configfs_attribute *r2hb_hb_group_attrs[] = {
-	&r2hb_hb_group_attr_threshold.attr,
-	&r2hb_hb_group_attr_mode.attr,
-	NULL,
-};
-
-static struct configfs_item_operations r2hb_hearbeat_group_item_ops = {
-	.show_attribute		= r2hb_hb_group_show,
-	.store_attribute	= r2hb_hb_group_store,
-};
-
-static struct configfs_group_operations r2hb_hb_group_group_ops = {
-	.make_item	= r2hb_hb_group_make_item,
-	.drop_item	= r2hb_hb_group_drop_item,
-};
-
-static struct config_item_type r2hb_hb_group_type = {
-	.ct_group_ops	= &r2hb_hb_group_group_ops,
-	.ct_item_ops	= &r2hb_hearbeat_group_item_ops,
-	.ct_attrs	= r2hb_hb_group_attrs,
-	.ct_owner	= THIS_MODULE,
-};
-
-/* this is just here to avoid touching group in heartbeat.h which the
- * entire damn world #includes */
-struct config_group *r2hb_alloc_hb_set(void)
-{
-	struct r2hb_hb_group *hs = NULL;
-	struct config_group *ret = NULL;
-
-	hs = kzalloc(sizeof(struct r2hb_hb_group), GFP_KERNEL);
-	if (hs == NULL)
-		goto out;
-
-	config_group_init_type_name(&hs->hs_group, "heartbeat",
-				    &r2hb_hb_group_type);
-
-	ret = &hs->hs_group;
-out:
-	if (ret == NULL)
-		kfree(hs);
-	return ret;
-}
-
-void r2hb_free_hb_set(struct config_group *group)
-{
-	struct r2hb_hb_group *hs = to_r2hb_hb_group(group);
-	kfree(hs);
-}
-
-/* hb callback registration and issuing */
-
-static struct r2hb_callback *hbcall_from_type(enum r2hb_callback_type type)
-{
-	if (type == R2HB_NUM_CB)
-		return ERR_PTR(-EINVAL);
-
-	return &r2hb_callbacks[type];
-}
-
-void r2hb_setup_callback(struct r2hb_callback_func *hc,
-			 enum r2hb_callback_type type,
-			 r2hb_cb_func *func,
-			 void *data,
-			 int priority)
-{
-	INIT_LIST_HEAD(&hc->hc_item);
-	hc->hc_func = func;
-	hc->hc_data = data;
-	hc->hc_priority = priority;
-	hc->hc_type = type;
-	hc->hc_magic = R2HB_CB_MAGIC;
-}
-EXPORT_SYMBOL_GPL(r2hb_setup_callback);
-
-int r2hb_register_callback(const char *region_uuid,
-			   struct r2hb_callback_func *hc)
-{
-	struct r2hb_callback_func *tmp;
-	struct list_head *iter;
-	struct r2hb_callback *hbcall;
-	int ret;
-
-	BUG_ON(hc->hc_magic != R2HB_CB_MAGIC);
-	BUG_ON(!list_empty(&hc->hc_item));
-
-	hbcall = hbcall_from_type(hc->hc_type);
-	if (IS_ERR(hbcall)) {
-		ret = PTR_ERR(hbcall);
-		goto out;
-	}
-
-	down_write(&r2hb_callback_sem);
-
-	list_for_each(iter, &hbcall->list) {
-		tmp = list_entry(iter, struct r2hb_callback_func, hc_item);
-		if (hc->hc_priority < tmp->hc_priority) {
-			list_add_tail(&hc->hc_item, iter);
-			break;
-		}
-	}
-	if (list_empty(&hc->hc_item))
-		list_add_tail(&hc->hc_item, &hbcall->list);
-
-	up_write(&r2hb_callback_sem);
-	ret = 0;
-out:
-	mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n",
-	     ret, __builtin_return_address(0), hc);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(r2hb_register_callback);
-
-void r2hb_unregister_callback(const char *region_uuid,
-			      struct r2hb_callback_func *hc)
-{
-	BUG_ON(hc->hc_magic != R2HB_CB_MAGIC);
-
-	mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n",
-	     __builtin_return_address(0), hc);
-
-	/* XXX Can this happen _with_ a region reference? */
-	if (list_empty(&hc->hc_item))
-		return;
-
-	down_write(&r2hb_callback_sem);
-
-	list_del_init(&hc->hc_item);
-
-	up_write(&r2hb_callback_sem);
-}
-EXPORT_SYMBOL_GPL(r2hb_unregister_callback);
-
-int r2hb_check_node_heartbeating_from_callback(u8 node_num)
-{
-	unsigned long testing_map[BITS_TO_LONGS(R2NM_MAX_NODES)];
-
-	r2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
-	if (!test_bit(node_num, testing_map)) {
-		mlog(ML_HEARTBEAT,
-		     "node (%u) does not have heartbeating enabled.\n",
-		     node_num);
-		return 0;
-	}
-
-	return 1;
-}
-EXPORT_SYMBOL_GPL(r2hb_check_node_heartbeating_from_callback);
-
-void r2hb_stop_all_regions(void)
-{
-}
-EXPORT_SYMBOL_GPL(r2hb_stop_all_regions);
-
-/*
- * this is just a hack until we get the plumbing which flips file systems
- * read only and drops the hb ref instead of killing the node dead.
- */
-int r2hb_global_heartbeat_active(void)
-{
-	return (r2hb_heartbeat_mode == R2HB_HEARTBEAT_GLOBAL);
-}
-EXPORT_SYMBOL(r2hb_global_heartbeat_active);
-
-/* added for RAMster */
-void r2hb_manual_set_node_heartbeating(int node_num)
-{
-	if (node_num < R2NM_MAX_NODES)
-		set_bit(node_num, r2hb_live_node_bitmap);
-}
-EXPORT_SYMBOL(r2hb_manual_set_node_heartbeating);
diff --git a/drivers/staging/ramster/cluster/heartbeat.h b/drivers/staging/ramster/cluster/heartbeat.h
deleted file mode 100755
index 6cbc775bd..000000000
--- a/drivers/staging/ramster/cluster/heartbeat.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * heartbeat.h
- *
- * Function prototypes
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef R2CLUSTER_HEARTBEAT_H
-#define R2CLUSTER_HEARTBEAT_H
-
-#define R2HB_REGION_TIMEOUT_MS		2000
-
-#define R2HB_MAX_REGION_NAME_LEN	32
-
-/* number of changes to be seen as live */
-#define R2HB_LIVE_THRESHOLD	   2
-/* number of equal samples to be seen as dead */
-extern unsigned int r2hb_dead_threshold;
-#define R2HB_DEFAULT_DEAD_THRESHOLD	   31
-/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
-#define R2HB_MIN_DEAD_THRESHOLD	  2
-#define R2HB_MAX_WRITE_TIMEOUT_MS \
-	(R2HB_REGION_TIMEOUT_MS * (r2hb_dead_threshold - 1))
-
-#define R2HB_CB_MAGIC		0x51d1e4ec
-
-/* callback stuff */
-enum r2hb_callback_type {
-	R2HB_NODE_DOWN_CB = 0,
-	R2HB_NODE_UP_CB,
-	R2HB_NUM_CB
-};
-
-struct r2nm_node;
-typedef void (r2hb_cb_func)(struct r2nm_node *, int, void *);
-
-struct r2hb_callback_func {
-	u32			hc_magic;
-	struct list_head	hc_item;
-	r2hb_cb_func		*hc_func;
-	void			*hc_data;
-	int			hc_priority;
-	enum r2hb_callback_type hc_type;
-};
-
-struct config_group *r2hb_alloc_hb_set(void);
-void r2hb_free_hb_set(struct config_group *group);
-
-void r2hb_setup_callback(struct r2hb_callback_func *hc,
-			 enum r2hb_callback_type type,
-			 r2hb_cb_func *func,
-			 void *data,
-			 int priority);
-int r2hb_register_callback(const char *region_uuid,
-			   struct r2hb_callback_func *hc);
-void r2hb_unregister_callback(const char *region_uuid,
-			      struct r2hb_callback_func *hc);
-void r2hb_fill_node_map(unsigned long *map,
-			unsigned bytes);
-void r2hb_exit(void);
-int r2hb_init(void);
-int r2hb_check_node_heartbeating_from_callback(u8 node_num);
-void r2hb_stop_all_regions(void);
-int r2hb_get_all_regions(char *region_uuids, u8 numregions);
-int r2hb_global_heartbeat_active(void);
-void r2hb_manual_set_node_heartbeating(int);
-
-#endif /* R2CLUSTER_HEARTBEAT_H */
diff --git a/drivers/staging/ramster/cluster/masklog.c b/drivers/staging/ramster/cluster/masklog.c
deleted file mode 100755
index 1261d8579..000000000
--- a/drivers/staging/ramster/cluster/masklog.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2004, 2005, 2012 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-
-#include "masklog.h"
-
-struct mlog_bits r2_mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
-EXPORT_SYMBOL_GPL(r2_mlog_and_bits);
-struct mlog_bits r2_mlog_not_bits = MLOG_BITS_RHS(0);
-EXPORT_SYMBOL_GPL(r2_mlog_not_bits);
-
-static ssize_t mlog_mask_show(u64 mask, char *buf)
-{
-	char *state;
-
-	if (__mlog_test_u64(mask, r2_mlog_and_bits))
-		state = "allow";
-	else if (__mlog_test_u64(mask, r2_mlog_not_bits))
-		state = "deny";
-	else
-		state = "off";
-
-	return snprintf(buf, PAGE_SIZE, "%s\n", state);
-}
-
-static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
-{
-	if (!strnicmp(buf, "allow", 5)) {
-		__mlog_set_u64(mask, r2_mlog_and_bits);
-		__mlog_clear_u64(mask, r2_mlog_not_bits);
-	} else if (!strnicmp(buf, "deny", 4)) {
-		__mlog_set_u64(mask, r2_mlog_not_bits);
-		__mlog_clear_u64(mask, r2_mlog_and_bits);
-	} else if (!strnicmp(buf, "off", 3)) {
-		__mlog_clear_u64(mask, r2_mlog_not_bits);
-		__mlog_clear_u64(mask, r2_mlog_and_bits);
-	} else
-		return -EINVAL;
-
-	return count;
-}
-
-struct mlog_attribute {
-	struct attribute attr;
-	u64 mask;
-};
-
-#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
-
-#define define_mask(_name) {			\
-	.attr = {				\
-		.name = #_name,			\
-		.mode = S_IRUGO | S_IWUSR,	\
-	},					\
-	.mask = ML_##_name,			\
-}
-
-static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
-	define_mask(TCP),
-	define_mask(MSG),
-	define_mask(SOCKET),
-	define_mask(HEARTBEAT),
-	define_mask(HB_BIO),
-	define_mask(DLMFS),
-	define_mask(DLM),
-	define_mask(DLM_DOMAIN),
-	define_mask(DLM_THREAD),
-	define_mask(DLM_MASTER),
-	define_mask(DLM_RECOVERY),
-	define_mask(DLM_GLUE),
-	define_mask(VOTE),
-	define_mask(CONN),
-	define_mask(QUORUM),
-	define_mask(BASTS),
-	define_mask(CLUSTER),
-	define_mask(ERROR),
-	define_mask(NOTICE),
-	define_mask(KTHREAD),
-};
-
-static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
-
-static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
-			 char *buf)
-{
-	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
-
-	return mlog_mask_show(mlog_attr->mask, buf);
-}
-
-static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
-			  const char *buf, size_t count)
-{
-	struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
-
-	return mlog_mask_store(mlog_attr->mask, buf, count);
-}
-
-static const struct sysfs_ops mlog_attr_ops = {
-	.show  = mlog_show,
-	.store = mlog_store,
-};
-
-static struct kobj_type mlog_ktype = {
-	.default_attrs = mlog_attr_ptrs,
-	.sysfs_ops     = &mlog_attr_ops,
-};
-
-static struct kset mlog_kset = {
-	.kobj   = {.ktype = &mlog_ktype},
-};
-
-int r2_mlog_sys_init(struct kset *r2cb_kset)
-{
-	int i = 0;
-
-	while (mlog_attrs[i].attr.mode) {
-		mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
-		i++;
-	}
-	mlog_attr_ptrs[i] = NULL;
-
-	kobject_set_name(&mlog_kset.kobj, "logmask");
-	mlog_kset.kobj.kset = r2cb_kset;
-	return kset_register(&mlog_kset);
-}
-
-void r2_mlog_sys_shutdown(void)
-{
-	kset_unregister(&mlog_kset);
-}
diff --git a/drivers/staging/ramster/cluster/masklog.h b/drivers/staging/ramster/cluster/masklog.h
deleted file mode 100755
index 918ae110b..000000000
--- a/drivers/staging/ramster/cluster/masklog.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005, 2012 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef R2CLUSTER_MASKLOG_H
-#define R2CLUSTER_MASKLOG_H
-
-/*
- * For now this is a trivial wrapper around printk() that gives the critical
- * ability to enable sets of debugging output at run-time.  In the future this
- * will almost certainly be redirected to relayfs so that it can pay a
- * substantially lower heisenberg tax.
- *
- * Callers associate the message with a bitmask and a global bitmask is
- * maintained with help from /proc.  If any of the bits match the message is
- * output.
- *
- * We must have efficient bit tests on i386 and it seems gcc still emits crazy
- * code for the 64bit compare.  It emits very good code for the dual unsigned
- * long tests, though, completely avoiding tests that can never pass if the
- * caller gives a constant bitmask that fills one of the longs with all 0s.  So
- * the desire is to have almost all of the calls decided on by comparing just
- * one of the longs.  This leads to having infrequently given bits that are
- * frequently matched in the high bits.
- *
- * _ERROR and _NOTICE are used for messages that always go to the console and
- * have appropriate KERN_ prefixes.  We wrap these in our function instead of
- * just calling printk() so that this can eventually make its way through
- * relayfs along with the debugging messages.  Everything else gets KERN_DEBUG.
- * The inline tests and macro dance give GCC the opportunity to quite cleverly
- * only emit the appropriage printk() when the caller passes in a constant
- * mask, as is almost always the case.
- *
- * All this bitmask nonsense is managed from the files under
- * /sys/fs/r2cb/logmask/.  Reading the files gives a straightforward
- * indication of which bits are allowed (allow) or denied (off/deny).
- *	ENTRY deny
- *	EXIT deny
- *	TCP off
- *	MSG off
- *	SOCKET off
- *	ERROR allow
- *	NOTICE allow
- *
- * Writing changes the state of a given bit and requires a strictly formatted
- * single write() call:
- *
- *	write(fd, "allow", 5);
- *
- * Echoing allow/deny/off string into the logmask files can flip the bits
- * on or off as expected; here is the bash script for example:
- *
- * log_mask="/sys/fs/r2cb/log_mask"
- * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do
- *	echo allow >"$log_mask"/"$node"
- * done
- *
- * The debugfs.ramster tool can also flip the bits with the -l option:
- *
- * debugfs.ramster -l TCP allow
- */
-
-/* for task_struct */
-#include <linux/sched.h>
-
-/* bits that are frequently given and infrequently matched in the low word */
-/* NOTE: If you add a flag, you need to also update masklog.c! */
-#define ML_TCP		0x0000000000000001ULL /* net cluster/tcp.c */
-#define ML_MSG		0x0000000000000002ULL /* net network messages */
-#define ML_SOCKET	0x0000000000000004ULL /* net socket lifetime */
-#define ML_HEARTBEAT	0x0000000000000008ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO	0x0000000000000010ULL /* hb io tracing */
-#define ML_DLMFS	0x0000000000000020ULL /* dlm user dlmfs */
-#define ML_DLM		0x0000000000000040ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN	0x0000000000000080ULL /* dlm domain debugging */
-#define ML_DLM_THREAD	0x0000000000000100ULL /* dlm domain thread */
-#define ML_DLM_MASTER	0x0000000000000200ULL /* dlm master functions */
-#define ML_DLM_RECOVERY	0x0000000000000400ULL /* dlm master functions */
-#define ML_DLM_GLUE	0x0000000000000800ULL /* ramster dlm glue layer */
-#define ML_VOTE		0x0000000000001000ULL /* ramster node messaging  */
-#define ML_CONN		0x0000000000002000ULL /* net connection management */
-#define ML_QUORUM	0x0000000000004000ULL /* net connection quorum */
-#define ML_BASTS	0x0000000000008000ULL /* dlmglue asts and basts */
-#define ML_CLUSTER	0x0000000000010000ULL /* cluster stack */
-
-/* bits that are infrequently given and frequently matched in the high word */
-#define ML_ERROR	0x1000000000000000ULL /* sent to KERN_ERR */
-#define ML_NOTICE	0x2000000000000000ULL /* setn to KERN_NOTICE */
-#define ML_KTHREAD	0x4000000000000000ULL /* kernel thread activity */
-
-#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#ifndef MLOG_MASK_PREFIX
-#define MLOG_MASK_PREFIX 0
-#endif
-
-/*
- * When logging is disabled, force the bit test to 0 for anything other
- * than errors and notices, allowing gcc to remove the code completely.
- * When enabled, allow all masks.
- */
-#if defined(CONFIG_RAMSTER_DEBUG_MASKLOG)
-#define ML_ALLOWED_BITS (~0)
-#else
-#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE)
-#endif
-
-#define MLOG_MAX_BITS 64
-
-struct mlog_bits {
-	unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
-};
-
-extern struct mlog_bits r2_mlog_and_bits, r2_mlog_not_bits;
-
-#if BITS_PER_LONG == 32
-
-#define __mlog_test_u64(mask, bits)			\
-	((u32)(mask & 0xffffffff) & bits.words[0] ||	\
-	  ((u64)(mask) >> 32) & bits.words[1])
-#define __mlog_set_u64(mask, bits) do {			\
-	bits.words[0] |= (u32)(mask & 0xffffffff);	\
-	bits.words[1] |= (u64)(mask) >> 32;		\
-} while (0)
-#define __mlog_clear_u64(mask, bits) do {		\
-	bits.words[0] &= ~((u32)(mask & 0xffffffff));	\
-	bits.words[1] &= ~((u64)(mask) >> 32);		\
-} while (0)
-#define MLOG_BITS_RHS(mask) {				\
-	{						\
-		[0] = (u32)(mask & 0xffffffff),		\
-		[1] = (u64)(mask) >> 32,		\
-	}						\
-}
-
-#else /* 32bit long above, 64bit long below */
-
-#define __mlog_test_u64(mask, bits)	((mask) & bits.words[0])
-#define __mlog_set_u64(mask, bits) do {		\
-	bits.words[0] |= (mask);		\
-} while (0)
-#define __mlog_clear_u64(mask, bits) do {	\
-	bits.words[0] &= ~(mask);		\
-} while (0)
-#define MLOG_BITS_RHS(mask) { { (mask) } }
-
-#endif
-
-/*
- * smp_processor_id() "helpfully" screams when called outside preemptible
- * regions in current kernels.  sles doesn't have the variants that don't
- * scream.  just do this instead of trying to guess which we're building
- * against.. *sigh*.
- */
-#define __mlog_cpu_guess ({		\
-	unsigned long _cpu = get_cpu();	\
-	put_cpu();			\
-	_cpu;				\
-})
-
-/* In the following two macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
- */
-#define __mlog_printk(level, fmt, args...)				\
-	printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm,		\
-	       task_pid_nr(current), __mlog_cpu_guess,			\
-	       __PRETTY_FUNCTION__, __LINE__ , ##args)
-
-#define mlog(mask, fmt, args...) do {					\
-	u64 __m = MLOG_MASK_PREFIX | (mask);				\
-	if ((__m & ML_ALLOWED_BITS) &&					\
-	    __mlog_test_u64(__m, r2_mlog_and_bits) &&			\
-	    !__mlog_test_u64(__m, r2_mlog_not_bits)) {			\
-		if (__m & ML_ERROR)					\
-			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
-		else if (__m & ML_NOTICE)				\
-			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
-		else							\
-			__mlog_printk(KERN_INFO, fmt , ##args);		\
-	}								\
-} while (0)
-
-#define mlog_errno(st) do {						\
-	int _st = (st);							\
-	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
-	    _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC)		\
-		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
-} while (0)
-
-#define mlog_bug_on_msg(cond, fmt, args...) do {			\
-	if (cond) {							\
-		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
-		mlog(ML_ERROR, fmt, ##args);				\
-		BUG();							\
-	}								\
-} while (0)
-
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-int r2_mlog_sys_init(struct kset *r2cb_subsys);
-void r2_mlog_sys_shutdown(void);
-
-#endif /* R2CLUSTER_MASKLOG_H */
diff --git a/drivers/staging/ramster/cluster/nodemanager.c b/drivers/staging/ramster/cluster/nodemanager.c
deleted file mode 100755
index de0e5c8da..000000000
--- a/drivers/staging/ramster/cluster/nodemanager.c
+++ /dev/null
@@ -1,992 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2004, 2005, 2012 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/slab.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/configfs.h>
-
-#include "tcp.h"
-#include "nodemanager.h"
-#include "heartbeat.h"
-#include "masklog.h"
-
-/* for now we operate under the assertion that there can be only one
- * cluster active at a time.  Changing this will require trickling
- * cluster references throughout where nodes are looked up */
-struct r2nm_cluster *r2nm_single_cluster;
-
-char *r2nm_fence_method_desc[R2NM_FENCE_METHODS] = {
-		"reset",	/* R2NM_FENCE_RESET */
-		"panic",	/* R2NM_FENCE_PANIC */
-};
-
-struct r2nm_node *r2nm_get_node_by_num(u8 node_num)
-{
-	struct r2nm_node *node = NULL;
-
-	if (node_num >= R2NM_MAX_NODES || r2nm_single_cluster == NULL)
-		goto out;
-
-	read_lock(&r2nm_single_cluster->cl_nodes_lock);
-	node = r2nm_single_cluster->cl_nodes[node_num];
-	if (node)
-		config_item_get(&node->nd_item);
-	read_unlock(&r2nm_single_cluster->cl_nodes_lock);
-out:
-	return node;
-}
-EXPORT_SYMBOL_GPL(r2nm_get_node_by_num);
-
-int r2nm_configured_node_map(unsigned long *map, unsigned bytes)
-{
-	struct r2nm_cluster *cluster = r2nm_single_cluster;
-
-	BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
-
-	if (cluster == NULL)
-		return -EINVAL;
-
-	read_lock(&cluster->cl_nodes_lock);
-	memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
-	read_unlock(&cluster->cl_nodes_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(r2nm_configured_node_map);
-
-static struct r2nm_node *r2nm_node_ip_tree_lookup(struct r2nm_cluster *cluster,
-						  __be32 ip_needle,
-						  struct rb_node ***ret_p,
-						  struct rb_node **ret_parent)
-{
-	struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct r2nm_node *node, *ret = NULL;
-
-	while (*p) {
-		int cmp;
-
-		parent = *p;
-		node = rb_entry(parent, struct r2nm_node, nd_ip_node);
-
-		cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
-				sizeof(ip_needle));
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else if (cmp > 0)
-			p = &(*p)->rb_right;
-		else {
-			ret = node;
-			break;
-		}
-	}
-
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-
-	return ret;
-}
-
-struct r2nm_node *r2nm_get_node_by_ip(__be32 addr)
-{
-	struct r2nm_node *node = NULL;
-	struct r2nm_cluster *cluster = r2nm_single_cluster;
-
-	if (cluster == NULL)
-		goto out;
-
-	read_lock(&cluster->cl_nodes_lock);
-	node = r2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
-	if (node)
-		config_item_get(&node->nd_item);
-	read_unlock(&cluster->cl_nodes_lock);
-
-out:
-	return node;
-}
-EXPORT_SYMBOL_GPL(r2nm_get_node_by_ip);
-
-void r2nm_node_put(struct r2nm_node *node)
-{
-	config_item_put(&node->nd_item);
-}
-EXPORT_SYMBOL_GPL(r2nm_node_put);
-
-void r2nm_node_get(struct r2nm_node *node)
-{
-	config_item_get(&node->nd_item);
-}
-EXPORT_SYMBOL_GPL(r2nm_node_get);
-
-u8 r2nm_this_node(void)
-{
-	u8 node_num = R2NM_MAX_NODES;
-
-	if (r2nm_single_cluster && r2nm_single_cluster->cl_has_local)
-		node_num = r2nm_single_cluster->cl_local_node;
-
-	return node_num;
-}
-EXPORT_SYMBOL_GPL(r2nm_this_node);
-
-/* node configfs bits */
-
-static struct r2nm_cluster *to_r2nm_cluster(struct config_item *item)
-{
-	return item ?
-		container_of(to_config_group(item), struct r2nm_cluster,
-			     cl_group)
-		: NULL;
-}
-
-static struct r2nm_node *to_r2nm_node(struct config_item *item)
-{
-	return item ? container_of(item, struct r2nm_node, nd_item) : NULL;
-}
-
-static void r2nm_node_release(struct config_item *item)
-{
-	struct r2nm_node *node = to_r2nm_node(item);
-	kfree(node);
-}
-
-static ssize_t r2nm_node_num_read(struct r2nm_node *node, char *page)
-{
-	return sprintf(page, "%d\n", node->nd_num);
-}
-
-static struct r2nm_cluster *to_r2nm_cluster_from_node(struct r2nm_node *node)
-{
-	/* through the first node_set .parent
-	 * mycluster/nodes/mynode == r2nm_cluster->r2nm_node_group->r2nm_node */
-	return to_r2nm_cluster(node->nd_item.ci_parent->ci_parent);
-}
-
-enum {
-	R2NM_NODE_ATTR_NUM = 0,
-	R2NM_NODE_ATTR_PORT,
-	R2NM_NODE_ATTR_ADDRESS,
-	R2NM_NODE_ATTR_LOCAL,
-};
-
-static ssize_t r2nm_node_num_write(struct r2nm_node *node, const char *page,
-				   size_t count)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node);
-	unsigned long tmp;
-	char *p = (char *)page;
-	int err;
-
-	err = kstrtoul(p, 10, &tmp);
-	if (err)
-		return err;
-
-	if (tmp >= R2NM_MAX_NODES)
-		return -ERANGE;
-
-	/* once we're in the cl_nodes tree networking can look us up by
-	 * node number and try to use our address and port attributes
-	 * to connect to this node.. make sure that they've been set
-	 * before writing the node attribute? */
-	if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
-	    !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
-		return -EINVAL; /* XXX */
-
-	write_lock(&cluster->cl_nodes_lock);
-	if (cluster->cl_nodes[tmp])
-		p = NULL;
-	else  {
-		cluster->cl_nodes[tmp] = node;
-		node->nd_num = tmp;
-		set_bit(tmp, cluster->cl_nodes_bitmap);
-	}
-	write_unlock(&cluster->cl_nodes_lock);
-	if (p == NULL)
-		return -EEXIST;
-
-	return count;
-}
-static ssize_t r2nm_node_ipv4_port_read(struct r2nm_node *node, char *page)
-{
-	return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
-}
-
-static ssize_t r2nm_node_ipv4_port_write(struct r2nm_node *node,
-					 const char *page, size_t count)
-{
-	unsigned long tmp;
-	char *p = (char *)page;
-	int err;
-
-	err = kstrtoul(p, 10, &tmp);
-	if (err)
-		return err;
-
-	if (tmp == 0)
-		return -EINVAL;
-	if (tmp >= (u16)-1)
-		return -ERANGE;
-
-	node->nd_ipv4_port = htons(tmp);
-
-	return count;
-}
-
-static ssize_t r2nm_node_ipv4_address_read(struct r2nm_node *node, char *page)
-{
-	return sprintf(page, "%pI4\n", &node->nd_ipv4_address);
-}
-
-static ssize_t r2nm_node_ipv4_address_write(struct r2nm_node *node,
-					    const char *page,
-					    size_t count)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node);
-	int ret, i;
-	struct rb_node **p, *parent;
-	unsigned int octets[4];
-	__be32 ipv4_addr = 0;
-
-	ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
-		     &octets[1], &octets[0]);
-	if (ret != 4)
-		return -EINVAL;
-
-	for (i = 0; i < ARRAY_SIZE(octets); i++) {
-		if (octets[i] > 255)
-			return -ERANGE;
-		be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
-	}
-
-	ret = 0;
-	write_lock(&cluster->cl_nodes_lock);
-	if (r2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
-		ret = -EEXIST;
-	else {
-		rb_link_node(&node->nd_ip_node, parent, p);
-		rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
-	}
-	write_unlock(&cluster->cl_nodes_lock);
-	if (ret)
-		return ret;
-
-	memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
-
-	return count;
-}
-
-static ssize_t r2nm_node_local_read(struct r2nm_node *node, char *page)
-{
-	return sprintf(page, "%d\n", node->nd_local);
-}
-
-static ssize_t r2nm_node_local_write(struct r2nm_node *node, const char *page,
-				     size_t count)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node);
-	unsigned long tmp;
-	char *p = (char *)page;
-	ssize_t ret;
-	int err;
-
-	err = kstrtoul(p, 10, &tmp);
-	if (err)
-		return err;
-
-	tmp = !!tmp; /* boolean of whether this node wants to be local */
-
-	/* setting local turns on networking rx for now so we require having
-	 * set everything else first */
-	if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
-	    !test_bit(R2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
-	    !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
-		return -EINVAL; /* XXX */
-
-	/* the only failure case is trying to set a new local node
-	 * when a different one is already set */
-	if (tmp && tmp == cluster->cl_has_local &&
-	    cluster->cl_local_node != node->nd_num)
-		return -EBUSY;
-
-	/* bring up the rx thread if we're setting the new local node. */
-	if (tmp && !cluster->cl_has_local) {
-		ret = r2net_start_listening(node);
-		if (ret)
-			return ret;
-	}
-
-	if (!tmp && cluster->cl_has_local &&
-	    cluster->cl_local_node == node->nd_num) {
-		r2net_stop_listening(node);
-		cluster->cl_local_node = R2NM_INVALID_NODE_NUM;
-	}
-
-	node->nd_local = tmp;
-	if (node->nd_local) {
-		cluster->cl_has_local = tmp;
-		cluster->cl_local_node = node->nd_num;
-	}
-
-	return count;
-}
-
-struct r2nm_node_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct r2nm_node *, char *);
-	ssize_t (*store)(struct r2nm_node *, const char *, size_t);
-};
-
-static struct r2nm_node_attribute r2nm_node_attr_num = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "num",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_node_num_read,
-	.store	= r2nm_node_num_write,
-};
-
-static struct r2nm_node_attribute r2nm_node_attr_ipv4_port = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "ipv4_port",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_node_ipv4_port_read,
-	.store	= r2nm_node_ipv4_port_write,
-};
-
-static struct r2nm_node_attribute r2nm_node_attr_ipv4_address = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "ipv4_address",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_node_ipv4_address_read,
-	.store	= r2nm_node_ipv4_address_write,
-};
-
-static struct r2nm_node_attribute r2nm_node_attr_local = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "local",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_node_local_read,
-	.store	= r2nm_node_local_write,
-};
-
-static struct configfs_attribute *r2nm_node_attrs[] = {
-	[R2NM_NODE_ATTR_NUM] = &r2nm_node_attr_num.attr,
-	[R2NM_NODE_ATTR_PORT] = &r2nm_node_attr_ipv4_port.attr,
-	[R2NM_NODE_ATTR_ADDRESS] = &r2nm_node_attr_ipv4_address.attr,
-	[R2NM_NODE_ATTR_LOCAL] = &r2nm_node_attr_local.attr,
-	NULL,
-};
-
-static int r2nm_attr_index(struct configfs_attribute *attr)
-{
-	int i;
-	for (i = 0; i < ARRAY_SIZE(r2nm_node_attrs); i++) {
-		if (attr == r2nm_node_attrs[i])
-			return i;
-	}
-	BUG();
-	return 0;
-}
-
-static ssize_t r2nm_node_show(struct config_item *item,
-			      struct configfs_attribute *attr,
-			      char *page)
-{
-	struct r2nm_node *node = to_r2nm_node(item);
-	struct r2nm_node_attribute *r2nm_node_attr =
-		container_of(attr, struct r2nm_node_attribute, attr);
-	ssize_t ret = 0;
-
-	if (r2nm_node_attr->show)
-		ret = r2nm_node_attr->show(node, page);
-	return ret;
-}
-
-static ssize_t r2nm_node_store(struct config_item *item,
-			       struct configfs_attribute *attr,
-			       const char *page, size_t count)
-{
-	struct r2nm_node *node = to_r2nm_node(item);
-	struct r2nm_node_attribute *r2nm_node_attr =
-		container_of(attr, struct r2nm_node_attribute, attr);
-	ssize_t ret;
-	int attr_index = r2nm_attr_index(attr);
-
-	if (r2nm_node_attr->store == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (test_bit(attr_index, &node->nd_set_attributes))
-		return -EBUSY;
-
-	ret = r2nm_node_attr->store(node, page, count);
-	if (ret < count)
-		goto out;
-
-	set_bit(attr_index, &node->nd_set_attributes);
-out:
-	return ret;
-}
-
-static struct configfs_item_operations r2nm_node_item_ops = {
-	.release		= r2nm_node_release,
-	.show_attribute		= r2nm_node_show,
-	.store_attribute	= r2nm_node_store,
-};
-
-static struct config_item_type r2nm_node_type = {
-	.ct_item_ops	= &r2nm_node_item_ops,
-	.ct_attrs	= r2nm_node_attrs,
-	.ct_owner	= THIS_MODULE,
-};
-
-/* node set */
-
-struct r2nm_node_group {
-	struct config_group ns_group;
-	/* some stuff? */
-};
-
-#if 0
-static struct r2nm_node_group *to_r2nm_node_group(struct config_group *group)
-{
-	return group ?
-		container_of(group, struct r2nm_node_group, ns_group)
-		: NULL;
-}
-#endif
-
-struct r2nm_cluster_attribute {
-	struct configfs_attribute attr;
-	ssize_t (*show)(struct r2nm_cluster *, char *);
-	ssize_t (*store)(struct r2nm_cluster *, const char *, size_t);
-};
-
-static ssize_t r2nm_cluster_attr_write(const char *page, ssize_t count,
-					unsigned int *val)
-{
-	unsigned long tmp;
-	char *p = (char *)page;
-	int err;
-
-	err = kstrtoul(p, 10, &tmp);
-	if (err)
-		return err;
-
-	if (tmp == 0)
-		return -EINVAL;
-	if (tmp >= (u32)-1)
-		return -ERANGE;
-
-	*val = tmp;
-
-	return count;
-}
-
-static ssize_t r2nm_cluster_attr_idle_timeout_ms_read(
-	struct r2nm_cluster *cluster, char *page)
-{
-	return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms);
-}
-
-static ssize_t r2nm_cluster_attr_idle_timeout_ms_write(
-	struct r2nm_cluster *cluster, const char *page, size_t count)
-{
-	ssize_t ret;
-	unsigned int val = 0;
-
-	ret =  r2nm_cluster_attr_write(page, count, &val);
-
-	if (ret > 0) {
-		if (cluster->cl_idle_timeout_ms != val
-			&& r2net_num_connected_peers()) {
-			mlog(ML_NOTICE,
-			     "r2net: cannot change idle timeout after "
-			     "the first peer has agreed to it."
-			     "  %d connected peers\n",
-			     r2net_num_connected_peers());
-			ret = -EINVAL;
-		} else if (val <= cluster->cl_keepalive_delay_ms) {
-			mlog(ML_NOTICE, "r2net: idle timeout must be larger "
-			     "than keepalive delay\n");
-			ret = -EINVAL;
-		} else {
-			cluster->cl_idle_timeout_ms = val;
-		}
-	}
-
-	return ret;
-}
-
-static ssize_t r2nm_cluster_attr_keepalive_delay_ms_read(
-	struct r2nm_cluster *cluster, char *page)
-{
-	return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms);
-}
-
-static ssize_t r2nm_cluster_attr_keepalive_delay_ms_write(
-	struct r2nm_cluster *cluster, const char *page, size_t count)
-{
-	ssize_t ret;
-	unsigned int val = 0;
-
-	ret =  r2nm_cluster_attr_write(page, count, &val);
-
-	if (ret > 0) {
-		if (cluster->cl_keepalive_delay_ms != val
-		    && r2net_num_connected_peers()) {
-			mlog(ML_NOTICE,
-			     "r2net: cannot change keepalive delay after"
-			     " the first peer has agreed to it."
-			     "  %d connected peers\n",
-			     r2net_num_connected_peers());
-			ret = -EINVAL;
-		} else if (val >= cluster->cl_idle_timeout_ms) {
-			mlog(ML_NOTICE, "r2net: keepalive delay must be "
-			     "smaller than idle timeout\n");
-			ret = -EINVAL;
-		} else {
-			cluster->cl_keepalive_delay_ms = val;
-		}
-	}
-
-	return ret;
-}
-
-static ssize_t r2nm_cluster_attr_reconnect_delay_ms_read(
-	struct r2nm_cluster *cluster, char *page)
-{
-	return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms);
-}
-
-static ssize_t r2nm_cluster_attr_reconnect_delay_ms_write(
-	struct r2nm_cluster *cluster, const char *page, size_t count)
-{
-	return r2nm_cluster_attr_write(page, count,
-					&cluster->cl_reconnect_delay_ms);
-}
-
-static ssize_t r2nm_cluster_attr_fence_method_read(
-	struct r2nm_cluster *cluster, char *page)
-{
-	ssize_t ret = 0;
-
-	if (cluster)
-		ret = sprintf(page, "%s\n",
-			      r2nm_fence_method_desc[cluster->cl_fence_method]);
-	return ret;
-}
-
-static ssize_t r2nm_cluster_attr_fence_method_write(
-	struct r2nm_cluster *cluster, const char *page, size_t count)
-{
-	unsigned int i;
-
-	if (page[count - 1] != '\n')
-		goto bail;
-
-	for (i = 0; i < R2NM_FENCE_METHODS; ++i) {
-		if (count != strlen(r2nm_fence_method_desc[i]) + 1)
-			continue;
-		if (strncasecmp(page, r2nm_fence_method_desc[i], count - 1))
-			continue;
-		if (cluster->cl_fence_method != i) {
-			printk(KERN_INFO "ramster: Changing fence method to %s\n",
-			       r2nm_fence_method_desc[i]);
-			cluster->cl_fence_method = i;
-		}
-		return count;
-	}
-
-bail:
-	return -EINVAL;
-}
-
-static struct r2nm_cluster_attribute r2nm_cluster_attr_idle_timeout_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "idle_timeout_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_cluster_attr_idle_timeout_ms_read,
-	.store	= r2nm_cluster_attr_idle_timeout_ms_write,
-};
-
-static struct r2nm_cluster_attribute r2nm_cluster_attr_keepalive_delay_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "keepalive_delay_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_cluster_attr_keepalive_delay_ms_read,
-	.store	= r2nm_cluster_attr_keepalive_delay_ms_write,
-};
-
-static struct r2nm_cluster_attribute r2nm_cluster_attr_reconnect_delay_ms = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "reconnect_delay_ms",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_cluster_attr_reconnect_delay_ms_read,
-	.store	= r2nm_cluster_attr_reconnect_delay_ms_write,
-};
-
-static struct r2nm_cluster_attribute r2nm_cluster_attr_fence_method = {
-	.attr	= { .ca_owner = THIS_MODULE,
-		    .ca_name = "fence_method",
-		    .ca_mode = S_IRUGO | S_IWUSR },
-	.show	= r2nm_cluster_attr_fence_method_read,
-	.store	= r2nm_cluster_attr_fence_method_write,
-};
-
-static struct configfs_attribute *r2nm_cluster_attrs[] = {
-	&r2nm_cluster_attr_idle_timeout_ms.attr,
-	&r2nm_cluster_attr_keepalive_delay_ms.attr,
-	&r2nm_cluster_attr_reconnect_delay_ms.attr,
-	&r2nm_cluster_attr_fence_method.attr,
-	NULL,
-};
-static ssize_t r2nm_cluster_show(struct config_item *item,
-					struct configfs_attribute *attr,
-					char *page)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster(item);
-	struct r2nm_cluster_attribute *r2nm_cluster_attr =
-		container_of(attr, struct r2nm_cluster_attribute, attr);
-	ssize_t ret = 0;
-
-	if (r2nm_cluster_attr->show)
-		ret = r2nm_cluster_attr->show(cluster, page);
-	return ret;
-}
-
-static ssize_t r2nm_cluster_store(struct config_item *item,
-					struct configfs_attribute *attr,
-					const char *page, size_t count)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster(item);
-	struct r2nm_cluster_attribute *r2nm_cluster_attr =
-		container_of(attr, struct r2nm_cluster_attribute, attr);
-	ssize_t ret;
-
-	if (r2nm_cluster_attr->store == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = r2nm_cluster_attr->store(cluster, page, count);
-	if (ret < count)
-		goto out;
-out:
-	return ret;
-}
-
-static struct config_item *r2nm_node_group_make_item(struct config_group *group,
-						     const char *name)
-{
-	struct r2nm_node *node = NULL;
-
-	if (strlen(name) > R2NM_MAX_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-
-	node = kzalloc(sizeof(struct r2nm_node), GFP_KERNEL);
-	if (node == NULL)
-		return ERR_PTR(-ENOMEM);
-
-	strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
-	config_item_init_type_name(&node->nd_item, name, &r2nm_node_type);
-	spin_lock_init(&node->nd_lock);
-
-	mlog(ML_CLUSTER, "r2nm: Registering node %s\n", name);
-
-	return &node->nd_item;
-}
-
-static void r2nm_node_group_drop_item(struct config_group *group,
-				      struct config_item *item)
-{
-	struct r2nm_node *node = to_r2nm_node(item);
-	struct r2nm_cluster *cluster =
-				to_r2nm_cluster(group->cg_item.ci_parent);
-
-	r2net_disconnect_node(node);
-
-	if (cluster->cl_has_local &&
-	    (cluster->cl_local_node == node->nd_num)) {
-		cluster->cl_has_local = 0;
-		cluster->cl_local_node = R2NM_INVALID_NODE_NUM;
-		r2net_stop_listening(node);
-	}
-
-	/* XXX call into net to stop this node from trading messages */
-
-	write_lock(&cluster->cl_nodes_lock);
-
-	/* XXX sloppy */
-	if (node->nd_ipv4_address)
-		rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
-
-	/* nd_num might be 0 if the node number hasn't been set.. */
-	if (cluster->cl_nodes[node->nd_num] == node) {
-		cluster->cl_nodes[node->nd_num] = NULL;
-		clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
-	}
-	write_unlock(&cluster->cl_nodes_lock);
-
-	mlog(ML_CLUSTER, "r2nm: Unregistered node %s\n",
-	     config_item_name(&node->nd_item));
-
-	config_item_put(item);
-}
-
-static struct configfs_group_operations r2nm_node_group_group_ops = {
-	.make_item	= r2nm_node_group_make_item,
-	.drop_item	= r2nm_node_group_drop_item,
-};
-
-static struct config_item_type r2nm_node_group_type = {
-	.ct_group_ops	= &r2nm_node_group_group_ops,
-	.ct_owner	= THIS_MODULE,
-};
-
-/* cluster */
-
-static void r2nm_cluster_release(struct config_item *item)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster(item);
-
-	kfree(cluster->cl_group.default_groups);
-	kfree(cluster);
-}
-
-static struct configfs_item_operations r2nm_cluster_item_ops = {
-	.release	= r2nm_cluster_release,
-	.show_attribute		= r2nm_cluster_show,
-	.store_attribute	= r2nm_cluster_store,
-};
-
-static struct config_item_type r2nm_cluster_type = {
-	.ct_item_ops	= &r2nm_cluster_item_ops,
-	.ct_attrs	= r2nm_cluster_attrs,
-	.ct_owner	= THIS_MODULE,
-};
-
-/* cluster set */
-
-struct r2nm_cluster_group {
-	struct configfs_subsystem cs_subsys;
-	/* some stuff? */
-};
-
-#if 0
-static struct r2nm_cluster_group *
-to_r2nm_cluster_group(struct config_group *group)
-{
-	return group ?
-		container_of(to_configfs_subsystem(group),
-				struct r2nm_cluster_group, cs_subsys)
-	       : NULL;
-}
-#endif
-
-static struct config_group *
-r2nm_cluster_group_make_group(struct config_group *group,
-							  const char *name)
-{
-	struct r2nm_cluster *cluster = NULL;
-	struct r2nm_node_group *ns = NULL;
-	struct config_group *r2hb_group = NULL, *ret = NULL;
-	void *defs = NULL;
-
-	/* this runs under the parent dir's i_mutex; there can be only
-	 * one caller in here at a time */
-	if (r2nm_single_cluster)
-		return ERR_PTR(-ENOSPC);
-
-	cluster = kzalloc(sizeof(struct r2nm_cluster), GFP_KERNEL);
-	ns = kzalloc(sizeof(struct r2nm_node_group), GFP_KERNEL);
-	defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
-	r2hb_group = r2hb_alloc_hb_set();
-	if (cluster == NULL || ns == NULL || r2hb_group == NULL || defs == NULL)
-		goto out;
-
-	config_group_init_type_name(&cluster->cl_group, name,
-				    &r2nm_cluster_type);
-	config_group_init_type_name(&ns->ns_group, "node",
-				    &r2nm_node_group_type);
-
-	cluster->cl_group.default_groups = defs;
-	cluster->cl_group.default_groups[0] = &ns->ns_group;
-	cluster->cl_group.default_groups[1] = r2hb_group;
-	cluster->cl_group.default_groups[2] = NULL;
-	rwlock_init(&cluster->cl_nodes_lock);
-	cluster->cl_node_ip_tree = RB_ROOT;
-	cluster->cl_reconnect_delay_ms = R2NET_RECONNECT_DELAY_MS_DEFAULT;
-	cluster->cl_idle_timeout_ms    = R2NET_IDLE_TIMEOUT_MS_DEFAULT;
-	cluster->cl_keepalive_delay_ms = R2NET_KEEPALIVE_DELAY_MS_DEFAULT;
-	cluster->cl_fence_method       = R2NM_FENCE_RESET;
-
-	ret = &cluster->cl_group;
-	r2nm_single_cluster = cluster;
-
-out:
-	if (ret == NULL) {
-		kfree(cluster);
-		kfree(ns);
-		r2hb_free_hb_set(r2hb_group);
-		kfree(defs);
-		ret = ERR_PTR(-ENOMEM);
-	}
-
-	return ret;
-}
-
-static void r2nm_cluster_group_drop_item(struct config_group *group,
-						struct config_item *item)
-{
-	struct r2nm_cluster *cluster = to_r2nm_cluster(item);
-	int i;
-	struct config_item *killme;
-
-	BUG_ON(r2nm_single_cluster != cluster);
-	r2nm_single_cluster = NULL;
-
-	for (i = 0; cluster->cl_group.default_groups[i]; i++) {
-		killme = &cluster->cl_group.default_groups[i]->cg_item;
-		cluster->cl_group.default_groups[i] = NULL;
-		config_item_put(killme);
-	}
-
-	config_item_put(item);
-}
-
-static struct configfs_group_operations r2nm_cluster_group_group_ops = {
-	.make_group	= r2nm_cluster_group_make_group,
-	.drop_item	= r2nm_cluster_group_drop_item,
-};
-
-static struct config_item_type r2nm_cluster_group_type = {
-	.ct_group_ops	= &r2nm_cluster_group_group_ops,
-	.ct_owner	= THIS_MODULE,
-};
-
-static struct r2nm_cluster_group r2nm_cluster_group = {
-	.cs_subsys = {
-		.su_group = {
-			.cg_item = {
-				.ci_namebuf = "cluster",
-				.ci_type = &r2nm_cluster_group_type,
-			},
-		},
-	},
-};
-
-int r2nm_depend_item(struct config_item *item)
-{
-	return configfs_depend_item(&r2nm_cluster_group.cs_subsys, item);
-}
-
-void r2nm_undepend_item(struct config_item *item)
-{
-	configfs_undepend_item(&r2nm_cluster_group.cs_subsys, item);
-}
-
-int r2nm_depend_this_node(void)
-{
-	int ret = 0;
-	struct r2nm_node *local_node;
-
-	local_node = r2nm_get_node_by_num(r2nm_this_node());
-	if (!local_node) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = r2nm_depend_item(&local_node->nd_item);
-	r2nm_node_put(local_node);
-
-out:
-	return ret;
-}
-
-void r2nm_undepend_this_node(void)
-{
-	struct r2nm_node *local_node;
-
-	local_node = r2nm_get_node_by_num(r2nm_this_node());
-	BUG_ON(!local_node);
-
-	r2nm_undepend_item(&local_node->nd_item);
-	r2nm_node_put(local_node);
-}
-
-
-static void __exit exit_r2nm(void)
-{
-	/* XXX sync with hb callbacks and shut down hb? */
-	r2net_unregister_hb_callbacks();
-	configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys);
-
-	r2net_exit();
-	r2hb_exit();
-}
-
-static int __init init_r2nm(void)
-{
-	int ret = -1;
-
-	ret = r2hb_init();
-	if (ret)
-		goto out;
-
-	ret = r2net_init();
-	if (ret)
-		goto out_r2hb;
-
-	ret = r2net_register_hb_callbacks();
-	if (ret)
-		goto out_r2net;
-
-	config_group_init(&r2nm_cluster_group.cs_subsys.su_group);
-	mutex_init(&r2nm_cluster_group.cs_subsys.su_mutex);
-	ret = configfs_register_subsystem(&r2nm_cluster_group.cs_subsys);
-	if (ret) {
-		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
-		goto out_callbacks;
-	}
-
-	if (!ret)
-		goto out;
-
-	configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys);
-out_callbacks:
-	r2net_unregister_hb_callbacks();
-out_r2net:
-	r2net_exit();
-out_r2hb:
-	r2hb_exit();
-out:
-	return ret;
-}
-
-MODULE_AUTHOR("Oracle");
-MODULE_LICENSE("GPL");
-
-module_init(init_r2nm)
-module_exit(exit_r2nm)
diff --git a/drivers/staging/ramster/cluster/nodemanager.h b/drivers/staging/ramster/cluster/nodemanager.h
deleted file mode 100755
index 41a04df58..000000000
--- a/drivers/staging/ramster/cluster/nodemanager.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * nodemanager.h
- *
- * Function prototypes
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef R2CLUSTER_NODEMANAGER_H
-#define R2CLUSTER_NODEMANAGER_H
-
-#include "ramster_nodemanager.h"
-
-/* This totally doesn't belong here. */
-#include <linux/configfs.h>
-#include <linux/rbtree.h>
-
-enum r2nm_fence_method {
-	R2NM_FENCE_RESET	= 0,
-	R2NM_FENCE_PANIC,
-	R2NM_FENCE_METHODS,	/* Number of fence methods */
-};
-
-struct r2nm_node {
-	spinlock_t		nd_lock;
-	struct config_item	nd_item;
-	char			nd_name[R2NM_MAX_NAME_LEN+1]; /* replace? */
-	__u8			nd_num;
-	/* only one address per node, as attributes, for now. */
-	__be32			nd_ipv4_address;
-	__be16			nd_ipv4_port;
-	struct rb_node		nd_ip_node;
-	/* there can be only one local node for now */
-	int			nd_local;
-
-	unsigned long		nd_set_attributes;
-};
-
-struct r2nm_cluster {
-	struct config_group	cl_group;
-	unsigned		cl_has_local:1;
-	u8			cl_local_node;
-	rwlock_t		cl_nodes_lock;
-	struct r2nm_node	*cl_nodes[R2NM_MAX_NODES];
-	struct rb_root		cl_node_ip_tree;
-	unsigned int		cl_idle_timeout_ms;
-	unsigned int		cl_keepalive_delay_ms;
-	unsigned int		cl_reconnect_delay_ms;
-	enum r2nm_fence_method	cl_fence_method;
-
-	/* part of a hack for disk bitmap.. will go eventually. - zab */
-	unsigned long	cl_nodes_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)];
-};
-
-extern struct r2nm_cluster *r2nm_single_cluster;
-
-u8 r2nm_this_node(void);
-
-int r2nm_configured_node_map(unsigned long *map, unsigned bytes);
-struct r2nm_node *r2nm_get_node_by_num(u8 node_num);
-struct r2nm_node *r2nm_get_node_by_ip(__be32 addr);
-void r2nm_node_get(struct r2nm_node *node);
-void r2nm_node_put(struct r2nm_node *node);
-
-int r2nm_depend_item(struct config_item *item);
-void r2nm_undepend_item(struct config_item *item);
-int r2nm_depend_this_node(void);
-void r2nm_undepend_this_node(void);
-
-#endif /* R2CLUSTER_NODEMANAGER_H */
diff --git a/drivers/staging/ramster/cluster/ramster_nodemanager.h b/drivers/staging/ramster/cluster/ramster_nodemanager.h
deleted file mode 100755
index 49f879d94..000000000
--- a/drivers/staging/ramster/cluster/ramster_nodemanager.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ramster_nodemanager.h
- *
- * Header describing the interface between userspace and the kernel
- * for the ramster_nodemanager module.
- *
- * Copyright (C) 2002, 2004, 2012 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef _RAMSTER_NODEMANAGER_H
-#define _RAMSTER_NODEMANAGER_H
-
-#define R2NM_API_VERSION	5
-
-#define R2NM_MAX_NODES		255
-#define R2NM_INVALID_NODE_NUM	255
-
-/* host name, group name, cluster name all 64 bytes */
-#define R2NM_MAX_NAME_LEN        64    /* __NEW_UTS_LEN */
-
-#endif /* _RAMSTER_NODEMANAGER_H */
diff --git a/drivers/staging/ramster/cluster/tcp.c b/drivers/staging/ramster/cluster/tcp.c
deleted file mode 100755
index 3af1b2c51..000000000
--- a/drivers/staging/ramster/cluster/tcp.c
+++ /dev/null
@@ -1,2256 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- *
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- * ----
- *
- * Callers for this were originally written against a very simple synchronus
- * API.  This implementation reflects those simple callers.  Some day I'm sure
- * we'll need to move to a more robust posting/callback mechanism.
- *
- * Transmit calls pass in kernel virtual addresses and block copying this into
- * the socket's tx buffers via a usual blocking sendmsg.  They'll block waiting
- * for a failed socket to timeout.  TX callers can also pass in a poniter to an
- * 'int' which gets filled with an errno off the wire in response to the
- * message they send.
- *
- * Handlers for unsolicited messages are registered.  Each socket has a page
- * that incoming data is copied into.  First the header, then the data.
- * Handlers are called from only one thread with a reference to this per-socket
- * page.  This page is destroyed after the handler call, so it can't be
- * referenced beyond the call.  Handlers may block but are discouraged from
- * doing so.
- *
- * Any framing errors (bad magic, large payload lengths) close a connection.
- *
- * Our sock_container holds the state we associate with a socket.  It's current
- * framing state is held there as well as the refcounting we do around when it
- * is safe to tear down the socket.  The socket is only finally torn down from
- * the container when the container loses all of its references -- so as long
- * as you hold a ref on the container you can trust that the socket is valid
- * for use with kernel socket APIs.
- *
- * Connections are initiated between a pair of nodes when the node with the
- * higher node number gets a heartbeat callback which indicates that the lower
- * numbered node has started heartbeating.  The lower numbered node is passive
- * and only accepts the connection if the higher numbered node is heartbeating.
- */
-
-#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/slab.h>
-#include <linux/idr.h>
-#include <linux/kref.h>
-#include <linux/net.h>
-#include <linux/export.h>
-#include <linux/uaccess.h>
-#include <net/tcp.h>
-
-
-#include "heartbeat.h"
-#include "tcp.h"
-#include "nodemanager.h"
-#define MLOG_MASK_PREFIX ML_TCP
-#include "masklog.h"
-
-#include "tcp_internal.h"
-
-#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u"
-
-/*
- * In the following two log macros, the whitespace after the ',' just
- * before ##args is intentional. Otherwise, gcc 2.95 will eat the
- * previous token if args expands to nothing.
- */
-#define msglog(hdr, fmt, args...) do {					\
-	typeof(hdr) __hdr = (hdr);					\
-	mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d "	\
-	     "key %08x num %u] " fmt,					\
-	be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len),	\
-	     be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status),	\
-	     be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key),	\
-	     be32_to_cpu(__hdr->msg_num) ,  ##args);			\
-} while (0)
-
-#define sclog(sc, fmt, args...) do {					\
-	typeof(sc) __sc = (sc);						\
-	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
-	     "pg_off %zu] " fmt, __sc,					\
-	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
-	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
-	    ##args);							\
-} while (0)
-
-static DEFINE_RWLOCK(r2net_handler_lock);
-static struct rb_root r2net_handler_tree = RB_ROOT;
-
-static struct r2net_node r2net_nodes[R2NM_MAX_NODES];
-
-/* XXX someday we'll need better accounting */
-static struct socket *r2net_listen_sock;
-
-/*
- * listen work is only queued by the listening socket callbacks on the
- * r2net_wq.  teardown detaches the callbacks before destroying the workqueue.
- * quorum work is queued as sock containers are shutdown.. stop_listening
- * tears down all the node's sock containers, preventing future shutdowns
- * and queued quroum work, before canceling delayed quorum work and
- * destroying the work queue.
- */
-static struct workqueue_struct *r2net_wq;
-static struct work_struct r2net_listen_work;
-
-static struct r2hb_callback_func r2net_hb_up, r2net_hb_down;
-#define R2NET_HB_PRI 0x1
-
-static struct r2net_handshake *r2net_hand;
-static struct r2net_msg *r2net_keep_req, *r2net_keep_resp;
-
-static int r2net_sys_err_translations[R2NET_ERR_MAX] = {
-		[R2NET_ERR_NONE]	= 0,
-		[R2NET_ERR_NO_HNDLR]	= -ENOPROTOOPT,
-		[R2NET_ERR_OVERFLOW]	= -EOVERFLOW,
-		[R2NET_ERR_DIED]	= -EHOSTDOWN,};
-
-/* can't quite avoid *all* internal declarations :/ */
-static void r2net_sc_connect_completed(struct work_struct *work);
-static void r2net_rx_until_empty(struct work_struct *work);
-static void r2net_shutdown_sc(struct work_struct *work);
-static void r2net_listen_data_ready(struct sock *sk, int bytes);
-static void r2net_sc_send_keep_req(struct work_struct *work);
-static void r2net_idle_timer(unsigned long data);
-static void r2net_sc_postpone_idle(struct r2net_sock_container *sc);
-static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc);
-
-#ifdef CONFIG_DEBUG_FS
-static void r2net_init_nst(struct r2net_send_tracking *nst, u32 msgtype,
-			   u32 msgkey, struct task_struct *task, u8 node)
-{
-	INIT_LIST_HEAD(&nst->st_net_debug_item);
-	nst->st_task = task;
-	nst->st_msg_type = msgtype;
-	nst->st_msg_key = msgkey;
-	nst->st_node = node;
-}
-
-static inline void r2net_set_nst_sock_time(struct r2net_send_tracking *nst)
-{
-	nst->st_sock_time = ktime_get();
-}
-
-static inline void r2net_set_nst_send_time(struct r2net_send_tracking *nst)
-{
-	nst->st_send_time = ktime_get();
-}
-
-static inline void r2net_set_nst_status_time(struct r2net_send_tracking *nst)
-{
-	nst->st_status_time = ktime_get();
-}
-
-static inline void r2net_set_nst_sock_container(struct r2net_send_tracking *nst,
-						struct r2net_sock_container *sc)
-{
-	nst->st_sc = sc;
-}
-
-static inline void r2net_set_nst_msg_id(struct r2net_send_tracking *nst,
-					u32 msg_id)
-{
-	nst->st_id = msg_id;
-}
-
-static inline void r2net_set_sock_timer(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_timer = ktime_get();
-}
-
-static inline void r2net_set_data_ready_time(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_data_ready = ktime_get();
-}
-
-static inline void r2net_set_advance_start_time(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_advance_start = ktime_get();
-}
-
-static inline void r2net_set_advance_stop_time(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_advance_stop = ktime_get();
-}
-
-static inline void r2net_set_func_start_time(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_func_start = ktime_get();
-}
-
-static inline void r2net_set_func_stop_time(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_func_stop = ktime_get();
-}
-
-#else  /* CONFIG_DEBUG_FS */
-# define r2net_init_nst(a, b, c, d, e)
-# define r2net_set_nst_sock_time(a)
-# define r2net_set_nst_send_time(a)
-# define r2net_set_nst_status_time(a)
-# define r2net_set_nst_sock_container(a, b)
-# define r2net_set_nst_msg_id(a, b)
-# define r2net_set_sock_timer(a)
-# define r2net_set_data_ready_time(a)
-# define r2net_set_advance_start_time(a)
-# define r2net_set_advance_stop_time(a)
-# define r2net_set_func_start_time(a)
-# define r2net_set_func_stop_time(a)
-#endif /* CONFIG_DEBUG_FS */
-
-#ifdef CONFIG_RAMSTER_FS_STATS
-static ktime_t r2net_get_func_run_time(struct r2net_sock_container *sc)
-{
-	return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
-}
-
-static void r2net_update_send_stats(struct r2net_send_tracking *nst,
-				    struct r2net_sock_container *sc)
-{
-	sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total,
-					   ktime_sub(ktime_get(),
-						     nst->st_status_time));
-	sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total,
-					 ktime_sub(nst->st_status_time,
-						   nst->st_send_time));
-	sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total,
-					    ktime_sub(nst->st_send_time,
-						      nst->st_sock_time));
-	sc->sc_send_count++;
-}
-
-static void r2net_update_recv_stats(struct r2net_sock_container *sc)
-{
-	sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total,
-					    r2net_get_func_run_time(sc));
-	sc->sc_recv_count++;
-}
-
-#else
-
-# define r2net_update_send_stats(a, b)
-
-# define r2net_update_recv_stats(sc)
-
-#endif /* CONFIG_RAMSTER_FS_STATS */
-
-static inline int r2net_reconnect_delay(void)
-{
-	return r2nm_single_cluster->cl_reconnect_delay_ms;
-}
-
-static inline int r2net_keepalive_delay(void)
-{
-	return r2nm_single_cluster->cl_keepalive_delay_ms;
-}
-
-static inline int r2net_idle_timeout(void)
-{
-	return r2nm_single_cluster->cl_idle_timeout_ms;
-}
-
-static inline int r2net_sys_err_to_errno(enum r2net_system_error err)
-{
-	int trans;
-	BUG_ON(err >= R2NET_ERR_MAX);
-	trans = r2net_sys_err_translations[err];
-
-	/* Just in case we mess up the translation table above */
-	BUG_ON(err != R2NET_ERR_NONE && trans == 0);
-	return trans;
-}
-
-struct r2net_node *r2net_nn_from_num(u8 node_num)
-{
-	BUG_ON(node_num >= ARRAY_SIZE(r2net_nodes));
-	return &r2net_nodes[node_num];
-}
-
-static u8 r2net_num_from_nn(struct r2net_node *nn)
-{
-	BUG_ON(nn == NULL);
-	return nn - r2net_nodes;
-}
-
-/* ------------------------------------------------------------ */
-
-static int r2net_prep_nsw(struct r2net_node *nn, struct r2net_status_wait *nsw)
-{
-	int ret = 0;
-
-	do {
-		if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) {
-			ret = -EAGAIN;
-			break;
-		}
-		spin_lock(&nn->nn_lock);
-		ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id);
-		if (ret == 0)
-			list_add_tail(&nsw->ns_node_item,
-				      &nn->nn_status_list);
-		spin_unlock(&nn->nn_lock);
-	} while (ret == -EAGAIN);
-
-	if (ret == 0)  {
-		init_waitqueue_head(&nsw->ns_wq);
-		nsw->ns_sys_status = R2NET_ERR_NONE;
-		nsw->ns_status = 0;
-	}
-
-	return ret;
-}
-
-static void r2net_complete_nsw_locked(struct r2net_node *nn,
-				      struct r2net_status_wait *nsw,
-				      enum r2net_system_error sys_status,
-				      s32 status)
-{
-	assert_spin_locked(&nn->nn_lock);
-
-	if (!list_empty(&nsw->ns_node_item)) {
-		list_del_init(&nsw->ns_node_item);
-		nsw->ns_sys_status = sys_status;
-		nsw->ns_status = status;
-		idr_remove(&nn->nn_status_idr, nsw->ns_id);
-		wake_up(&nsw->ns_wq);
-	}
-}
-
-static void r2net_complete_nsw(struct r2net_node *nn,
-			       struct r2net_status_wait *nsw,
-			       u64 id, enum r2net_system_error sys_status,
-			       s32 status)
-{
-	spin_lock(&nn->nn_lock);
-	if (nsw == NULL) {
-		if (id > INT_MAX)
-			goto out;
-
-		nsw = idr_find(&nn->nn_status_idr, id);
-		if (nsw == NULL)
-			goto out;
-	}
-
-	r2net_complete_nsw_locked(nn, nsw, sys_status, status);
-
-out:
-	spin_unlock(&nn->nn_lock);
-	return;
-}
-
-static void r2net_complete_nodes_nsw(struct r2net_node *nn)
-{
-	struct r2net_status_wait *nsw, *tmp;
-	unsigned int num_kills = 0;
-
-	assert_spin_locked(&nn->nn_lock);
-
-	list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
-		r2net_complete_nsw_locked(nn, nsw, R2NET_ERR_DIED, 0);
-		num_kills++;
-	}
-
-	mlog(0, "completed %d messages for node %u\n", num_kills,
-	     r2net_num_from_nn(nn));
-}
-
-static int r2net_nsw_completed(struct r2net_node *nn,
-			       struct r2net_status_wait *nsw)
-{
-	int completed;
-	spin_lock(&nn->nn_lock);
-	completed = list_empty(&nsw->ns_node_item);
-	spin_unlock(&nn->nn_lock);
-	return completed;
-}
-
-/* ------------------------------------------------------------ */
-
-static void sc_kref_release(struct kref *kref)
-{
-	struct r2net_sock_container *sc = container_of(kref,
-					struct r2net_sock_container, sc_kref);
-	BUG_ON(timer_pending(&sc->sc_idle_timeout));
-
-	sclog(sc, "releasing\n");
-
-	if (sc->sc_sock) {
-		sock_release(sc->sc_sock);
-		sc->sc_sock = NULL;
-	}
-
-	r2nm_undepend_item(&sc->sc_node->nd_item);
-	r2nm_node_put(sc->sc_node);
-	sc->sc_node = NULL;
-
-	r2net_debug_del_sc(sc);
-	kfree(sc);
-}
-
-static void sc_put(struct r2net_sock_container *sc)
-{
-	sclog(sc, "put\n");
-	kref_put(&sc->sc_kref, sc_kref_release);
-}
-static void sc_get(struct r2net_sock_container *sc)
-{
-	sclog(sc, "get\n");
-	kref_get(&sc->sc_kref);
-}
-static struct r2net_sock_container *sc_alloc(struct r2nm_node *node)
-{
-	struct r2net_sock_container *sc, *ret = NULL;
-	struct page *page = NULL;
-	int status = 0;
-
-	page = alloc_page(GFP_NOFS);
-	sc = kzalloc(sizeof(*sc), GFP_NOFS);
-	if (sc == NULL || page == NULL)
-		goto out;
-
-	kref_init(&sc->sc_kref);
-	r2nm_node_get(node);
-	sc->sc_node = node;
-
-	/* pin the node item of the remote node */
-	status = r2nm_depend_item(&node->nd_item);
-	if (status) {
-		mlog_errno(status);
-		r2nm_node_put(node);
-		goto out;
-	}
-	INIT_WORK(&sc->sc_connect_work, r2net_sc_connect_completed);
-	INIT_WORK(&sc->sc_rx_work, r2net_rx_until_empty);
-	INIT_WORK(&sc->sc_shutdown_work, r2net_shutdown_sc);
-	INIT_DELAYED_WORK(&sc->sc_keepalive_work, r2net_sc_send_keep_req);
-
-	init_timer(&sc->sc_idle_timeout);
-	sc->sc_idle_timeout.function = r2net_idle_timer;
-	sc->sc_idle_timeout.data = (unsigned long)sc;
-
-	sclog(sc, "alloced\n");
-
-	ret = sc;
-	sc->sc_page = page;
-	r2net_debug_add_sc(sc);
-	sc = NULL;
-	page = NULL;
-
-out:
-	if (page)
-		__free_page(page);
-	kfree(sc);
-
-	return ret;
-}
-
-/* ------------------------------------------------------------ */
-
-static void r2net_sc_queue_work(struct r2net_sock_container *sc,
-				struct work_struct *work)
-{
-	sc_get(sc);
-	if (!queue_work(r2net_wq, work))
-		sc_put(sc);
-}
-static void r2net_sc_queue_delayed_work(struct r2net_sock_container *sc,
-					struct delayed_work *work,
-					int delay)
-{
-	sc_get(sc);
-	if (!queue_delayed_work(r2net_wq, work, delay))
-		sc_put(sc);
-}
-static void r2net_sc_cancel_delayed_work(struct r2net_sock_container *sc,
-					 struct delayed_work *work)
-{
-	if (cancel_delayed_work(work))
-		sc_put(sc);
-}
-
-static atomic_t r2net_connected_peers = ATOMIC_INIT(0);
-
-int r2net_num_connected_peers(void)
-{
-	return atomic_read(&r2net_connected_peers);
-}
-
-static void r2net_set_nn_state(struct r2net_node *nn,
-			       struct r2net_sock_container *sc,
-			       unsigned valid, int err)
-{
-	int was_valid = nn->nn_sc_valid;
-	int was_err = nn->nn_persistent_error;
-	struct r2net_sock_container *old_sc = nn->nn_sc;
-
-	assert_spin_locked(&nn->nn_lock);
-
-	if (old_sc && !sc)
-		atomic_dec(&r2net_connected_peers);
-	else if (!old_sc && sc)
-		atomic_inc(&r2net_connected_peers);
-
-	/* the node num comparison and single connect/accept path should stop
-	 * an non-null sc from being overwritten with another */
-	BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc);
-	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
-	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
-
-	if (was_valid && !valid && err == 0)
-		err = -ENOTCONN;
-
-	mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n",
-	     r2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid,
-	     nn->nn_persistent_error, err);
-
-	nn->nn_sc = sc;
-	nn->nn_sc_valid = valid ? 1 : 0;
-	nn->nn_persistent_error = err;
-
-	/* mirrors r2net_tx_can_proceed() */
-	if (nn->nn_persistent_error || nn->nn_sc_valid)
-		wake_up(&nn->nn_sc_wq);
-
-	if (!was_err && nn->nn_persistent_error) {
-		queue_delayed_work(r2net_wq, &nn->nn_still_up,
-				   msecs_to_jiffies(R2NET_QUORUM_DELAY_MS));
-	}
-
-	if (was_valid && !valid) {
-		printk(KERN_NOTICE "ramster: No longer connected to "
-		       SC_NODEF_FMT "\n",
-			old_sc->sc_node->nd_name, old_sc->sc_node->nd_num,
-			&old_sc->sc_node->nd_ipv4_address,
-			ntohs(old_sc->sc_node->nd_ipv4_port));
-		r2net_complete_nodes_nsw(nn);
-	}
-
-	if (!was_valid && valid) {
-		cancel_delayed_work(&nn->nn_connect_expired);
-		printk(KERN_NOTICE "ramster: %s " SC_NODEF_FMT "\n",
-		       r2nm_this_node() > sc->sc_node->nd_num ?
-		       "Connected to" : "Accepted connection from",
-		       sc->sc_node->nd_name, sc->sc_node->nd_num,
-			&sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port));
-	}
-
-	/* trigger the connecting worker func as long as we're not valid,
-	 * it will back off if it shouldn't connect.  This can be called
-	 * from node config teardown and so needs to be careful about
-	 * the work queue actually being up. */
-	if (!valid && r2net_wq) {
-		unsigned long delay;
-		/* delay if we're within a RECONNECT_DELAY of the
-		 * last attempt */
-		delay = (nn->nn_last_connect_attempt +
-			 msecs_to_jiffies(r2net_reconnect_delay()))
-			- jiffies;
-		if (delay > msecs_to_jiffies(r2net_reconnect_delay()))
-			delay = 0;
-		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
-		queue_delayed_work(r2net_wq, &nn->nn_connect_work, delay);
-
-		/*
-		 * Delay the expired work after idle timeout.
-		 *
-		 * We might have lots of failed connection attempts that run
-		 * through here but we only cancel the connect_expired work when
-		 * a connection attempt succeeds.  So only the first enqueue of
-		 * the connect_expired work will do anything.  The rest will see
-		 * that it's already queued and do nothing.
-		 */
-		delay += msecs_to_jiffies(r2net_idle_timeout());
-		queue_delayed_work(r2net_wq, &nn->nn_connect_expired, delay);
-	}
-
-	/* keep track of the nn's sc ref for the caller */
-	if ((old_sc == NULL) && sc)
-		sc_get(sc);
-	if (old_sc && (old_sc != sc)) {
-		r2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work);
-		sc_put(old_sc);
-	}
-}
-
-/* see r2net_register_callbacks() */
-static void r2net_data_ready(struct sock *sk, int bytes)
-{
-	void (*ready)(struct sock *sk, int bytes);
-
-	read_lock(&sk->sk_callback_lock);
-	if (sk->sk_user_data) {
-		struct r2net_sock_container *sc = sk->sk_user_data;
-		sclog(sc, "data_ready hit\n");
-		r2net_set_data_ready_time(sc);
-		r2net_sc_queue_work(sc, &sc->sc_rx_work);
-		ready = sc->sc_data_ready;
-	} else {
-		ready = sk->sk_data_ready;
-	}
-	read_unlock(&sk->sk_callback_lock);
-
-	ready(sk, bytes);
-}
-
-/* see r2net_register_callbacks() */
-static void r2net_state_change(struct sock *sk)
-{
-	void (*state_change)(struct sock *sk);
-	struct r2net_sock_container *sc;
-
-	read_lock(&sk->sk_callback_lock);
-	sc = sk->sk_user_data;
-	if (sc == NULL) {
-		state_change = sk->sk_state_change;
-		goto out;
-	}
-
-	sclog(sc, "state_change to %d\n", sk->sk_state);
-
-	state_change = sc->sc_state_change;
-
-	switch (sk->sk_state) {
-
-	/* ignore connecting sockets as they make progress */
-	case TCP_SYN_SENT:
-	case TCP_SYN_RECV:
-		break;
-	case TCP_ESTABLISHED:
-		r2net_sc_queue_work(sc, &sc->sc_connect_work);
-		break;
-	default:
-		printk(KERN_INFO "ramster: Connection to "
-			SC_NODEF_FMT " shutdown, state %d\n",
-			sc->sc_node->nd_name, sc->sc_node->nd_num,
-			&sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port), sk->sk_state);
-		r2net_sc_queue_work(sc, &sc->sc_shutdown_work);
-		break;
-
-	}
-out:
-	read_unlock(&sk->sk_callback_lock);
-	state_change(sk);
-}
-
-/*
- * we register callbacks so we can queue work on events before calling
- * the original callbacks.  our callbacks our careful to test user_data
- * to discover when they've reaced with r2net_unregister_callbacks().
- */
-static void r2net_register_callbacks(struct sock *sk,
-				     struct r2net_sock_container *sc)
-{
-	write_lock_bh(&sk->sk_callback_lock);
-
-	/* accepted sockets inherit the old listen socket data ready */
-	if (sk->sk_data_ready == r2net_listen_data_ready) {
-		sk->sk_data_ready = sk->sk_user_data;
-		sk->sk_user_data = NULL;
-	}
-
-	BUG_ON(sk->sk_user_data != NULL);
-	sk->sk_user_data = sc;
-	sc_get(sc);
-
-	sc->sc_data_ready = sk->sk_data_ready;
-	sc->sc_state_change = sk->sk_state_change;
-	sk->sk_data_ready = r2net_data_ready;
-	sk->sk_state_change = r2net_state_change;
-
-	mutex_init(&sc->sc_send_lock);
-
-	write_unlock_bh(&sk->sk_callback_lock);
-}
-
-static int r2net_unregister_callbacks(struct sock *sk,
-					struct r2net_sock_container *sc)
-{
-	int ret = 0;
-
-	write_lock_bh(&sk->sk_callback_lock);
-	if (sk->sk_user_data == sc) {
-		ret = 1;
-		sk->sk_user_data = NULL;
-		sk->sk_data_ready = sc->sc_data_ready;
-		sk->sk_state_change = sc->sc_state_change;
-	}
-	write_unlock_bh(&sk->sk_callback_lock);
-
-	return ret;
-}
-
-/*
- * this is a little helper that is called by callers who have seen a problem
- * with an sc and want to detach it from the nn if someone already hasn't beat
- * them to it.  if an error is given then the shutdown will be persistent
- * and pending transmits will be canceled.
- */
-static void r2net_ensure_shutdown(struct r2net_node *nn,
-					struct r2net_sock_container *sc,
-				   int err)
-{
-	spin_lock(&nn->nn_lock);
-	if (nn->nn_sc == sc)
-		r2net_set_nn_state(nn, NULL, 0, err);
-	spin_unlock(&nn->nn_lock);
-}
-
-/*
- * This work queue function performs the blocking parts of socket shutdown.  A
- * few paths lead here.  set_nn_state will trigger this callback if it sees an
- * sc detached from the nn.  state_change will also trigger this callback
- * directly when it sees errors.  In that case we need to call set_nn_state
- * ourselves as state_change couldn't get the nn_lock and call set_nn_state
- * itself.
- */
-static void r2net_shutdown_sc(struct work_struct *work)
-{
-	struct r2net_sock_container *sc =
-		container_of(work, struct r2net_sock_container,
-			     sc_shutdown_work);
-	struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num);
-
-	sclog(sc, "shutting down\n");
-
-	/* drop the callbacks ref and call shutdown only once */
-	if (r2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
-		/* we shouldn't flush as we're in the thread, the
-		 * races with pending sc work structs are harmless */
-		del_timer_sync(&sc->sc_idle_timeout);
-		r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
-		sc_put(sc);
-		kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
-	}
-
-	/* not fatal so failed connects before the other guy has our
-	 * heartbeat can be retried */
-	r2net_ensure_shutdown(nn, sc, 0);
-	sc_put(sc);
-}
-
-/* ------------------------------------------------------------ */
-
-static int r2net_handler_cmp(struct r2net_msg_handler *nmh, u32 msg_type,
-			     u32 key)
-{
-	int ret = memcmp(&nmh->nh_key, &key, sizeof(key));
-
-	if (ret == 0)
-		ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type));
-
-	return ret;
-}
-
-static struct r2net_msg_handler *
-r2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p,
-				struct rb_node **ret_parent)
-{
-	struct rb_node **p = &r2net_handler_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct r2net_msg_handler *nmh, *ret = NULL;
-	int cmp;
-
-	while (*p) {
-		parent = *p;
-		nmh = rb_entry(parent, struct r2net_msg_handler, nh_node);
-		cmp = r2net_handler_cmp(nmh, msg_type, key);
-
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else if (cmp > 0)
-			p = &(*p)->rb_right;
-		else {
-			ret = nmh;
-			break;
-		}
-	}
-
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-
-	return ret;
-}
-
-static void r2net_handler_kref_release(struct kref *kref)
-{
-	struct r2net_msg_handler *nmh;
-	nmh = container_of(kref, struct r2net_msg_handler, nh_kref);
-
-	kfree(nmh);
-}
-
-static void r2net_handler_put(struct r2net_msg_handler *nmh)
-{
-	kref_put(&nmh->nh_kref, r2net_handler_kref_release);
-}
-
-/* max_len is protection for the handler func.  incoming messages won't
- * be given to the handler if their payload is longer than the max. */
-int r2net_register_handler(u32 msg_type, u32 key, u32 max_len,
-			   r2net_msg_handler_func *func, void *data,
-			   r2net_post_msg_handler_func *post_func,
-			   struct list_head *unreg_list)
-{
-	struct r2net_msg_handler *nmh = NULL;
-	struct rb_node **p, *parent;
-	int ret = 0;
-
-	if (max_len > R2NET_MAX_PAYLOAD_BYTES) {
-		mlog(0, "max_len for message handler out of range: %u\n",
-			max_len);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (!msg_type) {
-		mlog(0, "no message type provided: %u, %p\n", msg_type, func);
-		ret = -EINVAL;
-		goto out;
-
-	}
-	if (!func) {
-		mlog(0, "no message handler provided: %u, %p\n",
-		       msg_type, func);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	nmh = kzalloc(sizeof(struct r2net_msg_handler), GFP_NOFS);
-	if (nmh == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	nmh->nh_func = func;
-	nmh->nh_func_data = data;
-	nmh->nh_post_func = post_func;
-	nmh->nh_msg_type = msg_type;
-	nmh->nh_max_len = max_len;
-	nmh->nh_key = key;
-	/* the tree and list get this ref.. they're both removed in
-	 * unregister when this ref is dropped */
-	kref_init(&nmh->nh_kref);
-	INIT_LIST_HEAD(&nmh->nh_unregister_item);
-
-	write_lock(&r2net_handler_lock);
-	if (r2net_handler_tree_lookup(msg_type, key, &p, &parent))
-		ret = -EEXIST;
-	else {
-		rb_link_node(&nmh->nh_node, parent, p);
-		rb_insert_color(&nmh->nh_node, &r2net_handler_tree);
-		list_add_tail(&nmh->nh_unregister_item, unreg_list);
-
-		mlog(ML_TCP, "registered handler func %p type %u key %08x\n",
-		     func, msg_type, key);
-		/* we've had some trouble with handlers seemingly vanishing. */
-		mlog_bug_on_msg(r2net_handler_tree_lookup(msg_type, key, &p,
-							  &parent) == NULL,
-				"couldn't find handler we *just* registered "
-				"for type %u key %08x\n", msg_type, key);
-	}
-	write_unlock(&r2net_handler_lock);
-	if (ret)
-		goto out;
-
-out:
-	if (ret)
-		kfree(nmh);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(r2net_register_handler);
-
-void r2net_unregister_handler_list(struct list_head *list)
-{
-	struct r2net_msg_handler *nmh, *n;
-
-	write_lock(&r2net_handler_lock);
-	list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
-		mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
-		     nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
-		rb_erase(&nmh->nh_node, &r2net_handler_tree);
-		list_del_init(&nmh->nh_unregister_item);
-		kref_put(&nmh->nh_kref, r2net_handler_kref_release);
-	}
-	write_unlock(&r2net_handler_lock);
-}
-EXPORT_SYMBOL_GPL(r2net_unregister_handler_list);
-
-static struct r2net_msg_handler *r2net_handler_get(u32 msg_type, u32 key)
-{
-	struct r2net_msg_handler *nmh;
-
-	read_lock(&r2net_handler_lock);
-	nmh = r2net_handler_tree_lookup(msg_type, key, NULL, NULL);
-	if (nmh)
-		kref_get(&nmh->nh_kref);
-	read_unlock(&r2net_handler_lock);
-
-	return nmh;
-}
-
-/* ------------------------------------------------------------ */
-
-static int r2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
-{
-	int ret;
-	mm_segment_t oldfs;
-	struct kvec vec = {
-		.iov_len = len,
-		.iov_base = data,
-	};
-	struct msghdr msg = {
-		.msg_iovlen = 1,
-		.msg_iov = (struct iovec *)&vec,
-		.msg_flags = MSG_DONTWAIT,
-	};
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
-	set_fs(oldfs);
-
-	return ret;
-}
-
-static int r2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
-			      size_t veclen, size_t total)
-{
-	int ret;
-	mm_segment_t oldfs;
-	struct msghdr msg = {
-		.msg_iov = (struct iovec *)vec,
-		.msg_iovlen = veclen,
-	};
-
-	if (sock == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_sendmsg(sock, &msg, total);
-	set_fs(oldfs);
-	if (ret != total) {
-		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
-		     total);
-		if (ret >= 0)
-			ret = -EPIPE; /* should be smarter, I bet */
-		goto out;
-	}
-
-	ret = 0;
-out:
-	if (ret < 0)
-		mlog(0, "returning error: %d\n", ret);
-	return ret;
-}
-
-static void r2net_sendpage(struct r2net_sock_container *sc,
-			   void *kmalloced_virt,
-			   size_t size)
-{
-	struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num);
-	ssize_t ret;
-
-	while (1) {
-		mutex_lock(&sc->sc_send_lock);
-		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
-					virt_to_page(kmalloced_virt),
-					(long)kmalloced_virt & ~PAGE_MASK,
-					size, MSG_DONTWAIT);
-		mutex_unlock(&sc->sc_send_lock);
-		if (ret == size)
-			break;
-		if (ret == (ssize_t)-EAGAIN) {
-			mlog(0, "sendpage of size %zu to " SC_NODEF_FMT
-			     " returned EAGAIN\n", size, sc->sc_node->nd_name,
-				sc->sc_node->nd_num,
-				&sc->sc_node->nd_ipv4_address,
-				ntohs(sc->sc_node->nd_ipv4_port));
-			cond_resched();
-			continue;
-		}
-		mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT
-		     " failed with %zd\n", size, sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port), ret);
-		r2net_ensure_shutdown(nn, sc, 0);
-		break;
-	}
-}
-
-static void r2net_init_msg(struct r2net_msg *msg, u16 data_len,
-				u16 msg_type, u32 key)
-{
-	memset(msg, 0, sizeof(struct r2net_msg));
-	msg->magic = cpu_to_be16(R2NET_MSG_MAGIC);
-	msg->data_len = cpu_to_be16(data_len);
-	msg->msg_type = cpu_to_be16(msg_type);
-	msg->sys_status = cpu_to_be32(R2NET_ERR_NONE);
-	msg->status = 0;
-	msg->key = cpu_to_be32(key);
-}
-
-static int r2net_tx_can_proceed(struct r2net_node *nn,
-				struct r2net_sock_container **sc_ret,
-				int *error)
-{
-	int ret = 0;
-
-	spin_lock(&nn->nn_lock);
-	if (nn->nn_persistent_error) {
-		ret = 1;
-		*sc_ret = NULL;
-		*error = nn->nn_persistent_error;
-	} else if (nn->nn_sc_valid) {
-		kref_get(&nn->nn_sc->sc_kref);
-
-		ret = 1;
-		*sc_ret = nn->nn_sc;
-		*error = 0;
-	}
-	spin_unlock(&nn->nn_lock);
-
-	return ret;
-}
-
-/* Get a map of all nodes to which this node is currently connected to */
-void r2net_fill_node_map(unsigned long *map, unsigned bytes)
-{
-	struct r2net_sock_container *sc;
-	int node, ret;
-
-	BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long)));
-
-	memset(map, 0, bytes);
-	for (node = 0; node < R2NM_MAX_NODES; ++node) {
-		r2net_tx_can_proceed(r2net_nn_from_num(node), &sc, &ret);
-		if (!ret) {
-			set_bit(node, map);
-			sc_put(sc);
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(r2net_fill_node_map);
-
-int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
-			   size_t caller_veclen, u8 target_node, int *status)
-{
-	int ret = 0;
-	struct r2net_msg *msg = NULL;
-	size_t veclen, caller_bytes = 0;
-	struct kvec *vec = NULL;
-	struct r2net_sock_container *sc = NULL;
-	struct r2net_node *nn = r2net_nn_from_num(target_node);
-	struct r2net_status_wait nsw = {
-		.ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
-	};
-	struct r2net_send_tracking nst;
-
-	/* this may be a general bug fix */
-	init_waitqueue_head(&nsw.ns_wq);
-
-	r2net_init_nst(&nst, msg_type, key, current, target_node);
-
-	if (r2net_wq == NULL) {
-		mlog(0, "attempt to tx without r2netd running\n");
-		ret = -ESRCH;
-		goto out;
-	}
-
-	if (caller_veclen == 0) {
-		mlog(0, "bad kvec array length\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen);
-	if (caller_bytes > R2NET_MAX_PAYLOAD_BYTES) {
-		mlog(0, "total payload len %zu too large\n", caller_bytes);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (target_node == r2nm_this_node()) {
-		ret = -ELOOP;
-		goto out;
-	}
-
-	r2net_debug_add_nst(&nst);
-
-	r2net_set_nst_sock_time(&nst);
-
-	wait_event(nn->nn_sc_wq, r2net_tx_can_proceed(nn, &sc, &ret));
-	if (ret)
-		goto out;
-
-	r2net_set_nst_sock_container(&nst, sc);
-
-	veclen = caller_veclen + 1;
-	vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
-	if (vec == NULL) {
-		mlog(0, "failed to %zu element kvec!\n", veclen);
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	msg = kmalloc(sizeof(struct r2net_msg), GFP_ATOMIC);
-	if (!msg) {
-		mlog(0, "failed to allocate a r2net_msg!\n");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	r2net_init_msg(msg, caller_bytes, msg_type, key);
-
-	vec[0].iov_len = sizeof(struct r2net_msg);
-	vec[0].iov_base = msg;
-	memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec));
-
-	ret = r2net_prep_nsw(nn, &nsw);
-	if (ret)
-		goto out;
-
-	msg->msg_num = cpu_to_be32(nsw.ns_id);
-	r2net_set_nst_msg_id(&nst, nsw.ns_id);
-
-	r2net_set_nst_send_time(&nst);
-
-	/* finally, convert the message header to network byte-order
-	 * and send */
-	mutex_lock(&sc->sc_send_lock);
-	ret = r2net_send_tcp_msg(sc->sc_sock, vec, veclen,
-				 sizeof(struct r2net_msg) + caller_bytes);
-	mutex_unlock(&sc->sc_send_lock);
-	msglog(msg, "sending returned %d\n", ret);
-	if (ret < 0) {
-		mlog(0, "error returned from r2net_send_tcp_msg=%d\n", ret);
-		goto out;
-	}
-
-	/* wait on other node's handler */
-	r2net_set_nst_status_time(&nst);
-	wait_event(nsw.ns_wq, r2net_nsw_completed(nn, &nsw));
-
-	r2net_update_send_stats(&nst, sc);
-
-	/* Note that we avoid overwriting the callers status return
-	 * variable if a system error was reported on the other
-	 * side. Callers beware. */
-	ret = r2net_sys_err_to_errno(nsw.ns_sys_status);
-	if (status && !ret)
-		*status = nsw.ns_status;
-
-	mlog(0, "woken, returning system status %d, user status %d\n",
-	     ret, nsw.ns_status);
-out:
-	r2net_debug_del_nst(&nst); /* must be before dropping sc and node */
-	if (sc)
-		sc_put(sc);
-	kfree(vec);
-	kfree(msg);
-	r2net_complete_nsw(nn, &nsw, 0, 0, 0);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(r2net_send_message_vec);
-
-int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
-		       u8 target_node, int *status)
-{
-	struct kvec vec = {
-		.iov_base = data,
-		.iov_len = len,
-	};
-	return r2net_send_message_vec(msg_type, key, &vec, 1,
-				      target_node, status);
-}
-EXPORT_SYMBOL_GPL(r2net_send_message);
-
-static int r2net_send_status_magic(struct socket *sock, struct r2net_msg *hdr,
-				   enum r2net_system_error syserr, int err)
-{
-	struct kvec vec = {
-		.iov_base = hdr,
-		.iov_len = sizeof(struct r2net_msg),
-	};
-
-	BUG_ON(syserr >= R2NET_ERR_MAX);
-
-	/* leave other fields intact from the incoming message, msg_num
-	 * in particular */
-	hdr->sys_status = cpu_to_be32(syserr);
-	hdr->status = cpu_to_be32(err);
-	/* twiddle the magic */
-	hdr->magic = cpu_to_be16(R2NET_MSG_STATUS_MAGIC);
-	hdr->data_len = 0;
-
-	msglog(hdr, "about to send status magic %d\n", err);
-	/* hdr has been in host byteorder this whole time */
-	return r2net_send_tcp_msg(sock, &vec, 1, sizeof(struct r2net_msg));
-}
-
-/*
- * "data magic" is a long version of "status magic" where the message
- * payload actually contains data to be passed in reply to certain messages
- */
-static int r2net_send_data_magic(struct r2net_sock_container *sc,
-			  struct r2net_msg *hdr,
-			  void *data, size_t data_len,
-			  enum r2net_system_error syserr, int err)
-{
-	struct kvec vec[2];
-	int ret;
-
-	vec[0].iov_base = hdr;
-	vec[0].iov_len = sizeof(struct r2net_msg);
-	vec[1].iov_base = data;
-	vec[1].iov_len = data_len;
-
-	BUG_ON(syserr >= R2NET_ERR_MAX);
-
-	/* leave other fields intact from the incoming message, msg_num
-	 * in particular */
-	hdr->sys_status = cpu_to_be32(syserr);
-	hdr->status = cpu_to_be32(err);
-	hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC);  /* twiddle magic */
-	hdr->data_len = cpu_to_be16(data_len);
-
-	msglog(hdr, "about to send data magic %d\n", err);
-	/* hdr has been in host byteorder this whole time */
-	ret = r2net_send_tcp_msg(sc->sc_sock, vec, 2,
-			sizeof(struct r2net_msg) + data_len);
-	return ret;
-}
-
-/*
- * called by a message handler to convert an otherwise normal reply
- * message into a "data magic" message
- */
-void r2net_force_data_magic(struct r2net_msg *hdr, u16 msgtype, u32 msgkey)
-{
-	hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC);
-	hdr->msg_type = cpu_to_be16(msgtype);
-	hdr->key = cpu_to_be32(msgkey);
-}
-
-/* this returns -errno if the header was unknown or too large, etc.
- * after this is called the buffer us reused for the next message */
-static int r2net_process_message(struct r2net_sock_container *sc,
-				 struct r2net_msg *hdr)
-{
-	struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num);
-	int ret = 0, handler_status;
-	enum  r2net_system_error syserr;
-	struct r2net_msg_handler *nmh = NULL;
-	void *ret_data = NULL;
-	int data_magic = 0;
-
-	msglog(hdr, "processing message\n");
-
-	r2net_sc_postpone_idle(sc);
-
-	switch (be16_to_cpu(hdr->magic)) {
-
-	case R2NET_MSG_STATUS_MAGIC:
-		/* special type for returning message status */
-		r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num),
-						be32_to_cpu(hdr->sys_status),
-						be32_to_cpu(hdr->status));
-		goto out;
-	case R2NET_MSG_KEEP_REQ_MAGIC:
-		r2net_sendpage(sc, r2net_keep_resp, sizeof(*r2net_keep_resp));
-		goto out;
-	case R2NET_MSG_KEEP_RESP_MAGIC:
-		goto out;
-	case R2NET_MSG_MAGIC:
-		break;
-	case R2NET_MSG_DATA_MAGIC:
-		/*
-		 * unlike a normal status magic, a data magic DOES
-		 * (MUST) have a handler, so the control flow is
-		 * a little funky here as a result
-		 */
-		data_magic = 1;
-		break;
-	default:
-		msglog(hdr, "bad magic\n");
-		ret = -EINVAL;
-		goto out;
-		break;
-	}
-
-	/* find a handler for it */
-	handler_status = 0;
-	nmh = r2net_handler_get(be16_to_cpu(hdr->msg_type),
-				be32_to_cpu(hdr->key));
-	if (!nmh) {
-		mlog(ML_TCP, "couldn't find handler for type %u key %08x\n",
-		     be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key));
-		syserr = R2NET_ERR_NO_HNDLR;
-		goto out_respond;
-	}
-
-	syserr = R2NET_ERR_NONE;
-
-	if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len)
-		syserr = R2NET_ERR_OVERFLOW;
-
-	if (syserr != R2NET_ERR_NONE)
-		goto out_respond;
-
-	r2net_set_func_start_time(sc);
-	sc->sc_msg_key = be32_to_cpu(hdr->key);
-	sc->sc_msg_type = be16_to_cpu(hdr->msg_type);
-	handler_status = (nmh->nh_func)(hdr, sizeof(struct r2net_msg) +
-					     be16_to_cpu(hdr->data_len),
-					nmh->nh_func_data, &ret_data);
-	if (data_magic) {
-		/*
-		 * handler handled data sent in reply to request
-		 * so complete the transaction
-		 */
-		r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num),
-			be32_to_cpu(hdr->sys_status), handler_status);
-		goto out;
-	}
-	/*
-	 * handler changed magic to DATA_MAGIC to reply to request for data,
-	 * implies ret_data points to data to return and handler_status
-	 * is the number of bytes of data
-	 */
-	if (be16_to_cpu(hdr->magic) == R2NET_MSG_DATA_MAGIC) {
-		ret = r2net_send_data_magic(sc, hdr,
-						ret_data, handler_status,
-						syserr, 0);
-		hdr = NULL;
-		mlog(0, "sending data reply %d, syserr %d returned %d\n",
-			handler_status, syserr, ret);
-		r2net_set_func_stop_time(sc);
-
-		r2net_update_recv_stats(sc);
-		goto out;
-	}
-	r2net_set_func_stop_time(sc);
-
-	r2net_update_recv_stats(sc);
-
-out_respond:
-	/* this destroys the hdr, so don't use it after this */
-	mutex_lock(&sc->sc_send_lock);
-	ret = r2net_send_status_magic(sc->sc_sock, hdr, syserr,
-				      handler_status);
-	mutex_unlock(&sc->sc_send_lock);
-	hdr = NULL;
-	mlog(0, "sending handler status %d, syserr %d returned %d\n",
-	     handler_status, syserr, ret);
-
-	if (nmh) {
-		BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL);
-		if (nmh->nh_post_func)
-			(nmh->nh_post_func)(handler_status, nmh->nh_func_data,
-					    ret_data);
-	}
-
-out:
-	if (nmh)
-		r2net_handler_put(nmh);
-	return ret;
-}
-
-static int r2net_check_handshake(struct r2net_sock_container *sc)
-{
-	struct r2net_handshake *hand = page_address(sc->sc_page);
-	struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num);
-
-	if (hand->protocol_version != cpu_to_be64(R2NET_PROTOCOL_VERSION)) {
-		printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " Advertised net "
-		       "protocol version %llu but %llu is required. "
-		       "Disconnecting.\n", sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port),
-		       (unsigned long long)be64_to_cpu(hand->protocol_version),
-		       R2NET_PROTOCOL_VERSION);
-
-		/* don't bother reconnecting if its the wrong version. */
-		r2net_ensure_shutdown(nn, sc, -ENOTCONN);
-		return -1;
-	}
-
-	/*
-	 * Ensure timeouts are consistent with other nodes, otherwise
-	 * we can end up with one node thinking that the other must be down,
-	 * but isn't. This can ultimately cause corruption.
-	 */
-	if (be32_to_cpu(hand->r2net_idle_timeout_ms) !=
-				r2net_idle_timeout()) {
-		printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a network "
-		       "idle timeout of %u ms, but we use %u ms locally. "
-		       "Disconnecting.\n", sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port),
-		       be32_to_cpu(hand->r2net_idle_timeout_ms),
-		       r2net_idle_timeout());
-		r2net_ensure_shutdown(nn, sc, -ENOTCONN);
-		return -1;
-	}
-
-	if (be32_to_cpu(hand->r2net_keepalive_delay_ms) !=
-			r2net_keepalive_delay()) {
-		printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a keepalive "
-		       "delay of %u ms, but we use %u ms locally. "
-		       "Disconnecting.\n", sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port),
-		       be32_to_cpu(hand->r2net_keepalive_delay_ms),
-		       r2net_keepalive_delay());
-		r2net_ensure_shutdown(nn, sc, -ENOTCONN);
-		return -1;
-	}
-
-	if (be32_to_cpu(hand->r2hb_heartbeat_timeout_ms) !=
-			R2HB_MAX_WRITE_TIMEOUT_MS) {
-		printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a heartbeat "
-		       "timeout of %u ms, but we use %u ms locally. "
-		       "Disconnecting.\n", sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port),
-		       be32_to_cpu(hand->r2hb_heartbeat_timeout_ms),
-		       R2HB_MAX_WRITE_TIMEOUT_MS);
-		r2net_ensure_shutdown(nn, sc, -ENOTCONN);
-		return -1;
-	}
-
-	sc->sc_handshake_ok = 1;
-
-	spin_lock(&nn->nn_lock);
-	/* set valid and queue the idle timers only if it hasn't been
-	 * shut down already */
-	if (nn->nn_sc == sc) {
-		r2net_sc_reset_idle_timer(sc);
-		atomic_set(&nn->nn_timeout, 0);
-		r2net_set_nn_state(nn, sc, 1, 0);
-	}
-	spin_unlock(&nn->nn_lock);
-
-	/* shift everything up as though it wasn't there */
-	sc->sc_page_off -= sizeof(struct r2net_handshake);
-	if (sc->sc_page_off)
-		memmove(hand, hand + 1, sc->sc_page_off);
-
-	return 0;
-}
-
-/* this demuxes the queued rx bytes into header or payload bits and calls
- * handlers as each full message is read off the socket.  it returns -error,
- * == 0 eof, or > 0 for progress made.*/
-static int r2net_advance_rx(struct r2net_sock_container *sc)
-{
-	struct r2net_msg *hdr;
-	int ret = 0;
-	void *data;
-	size_t datalen;
-
-	sclog(sc, "receiving\n");
-	r2net_set_advance_start_time(sc);
-
-	if (unlikely(sc->sc_handshake_ok == 0)) {
-		if (sc->sc_page_off < sizeof(struct r2net_handshake)) {
-			data = page_address(sc->sc_page) + sc->sc_page_off;
-			datalen = sizeof(struct r2net_handshake) -
-							sc->sc_page_off;
-			ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen);
-			if (ret > 0)
-				sc->sc_page_off += ret;
-		}
-
-		if (sc->sc_page_off == sizeof(struct r2net_handshake)) {
-			r2net_check_handshake(sc);
-			if (unlikely(sc->sc_handshake_ok == 0))
-				ret = -EPROTO;
-		}
-		goto out;
-	}
-
-	/* do we need more header? */
-	if (sc->sc_page_off < sizeof(struct r2net_msg)) {
-		data = page_address(sc->sc_page) + sc->sc_page_off;
-		datalen = sizeof(struct r2net_msg) - sc->sc_page_off;
-		ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen);
-		if (ret > 0) {
-			sc->sc_page_off += ret;
-			/* only swab incoming here.. we can
-			 * only get here once as we cross from
-			 * being under to over */
-			if (sc->sc_page_off == sizeof(struct r2net_msg)) {
-				hdr = page_address(sc->sc_page);
-				if (be16_to_cpu(hdr->data_len) >
-				    R2NET_MAX_PAYLOAD_BYTES)
-					ret = -EOVERFLOW;
-			}
-		}
-		if (ret <= 0)
-			goto out;
-	}
-
-	if (sc->sc_page_off < sizeof(struct r2net_msg)) {
-		/* oof, still don't have a header */
-		goto out;
-	}
-
-	/* this was swabbed above when we first read it */
-	hdr = page_address(sc->sc_page);
-
-	msglog(hdr, "at page_off %zu\n", sc->sc_page_off);
-
-	/* do we need more payload? */
-	if (sc->sc_page_off - sizeof(struct r2net_msg) <
-					be16_to_cpu(hdr->data_len)) {
-		/* need more payload */
-		data = page_address(sc->sc_page) + sc->sc_page_off;
-		datalen = (sizeof(struct r2net_msg) +
-				be16_to_cpu(hdr->data_len)) -
-				sc->sc_page_off;
-		ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen);
-		if (ret > 0)
-			sc->sc_page_off += ret;
-		if (ret <= 0)
-			goto out;
-	}
-
-	if (sc->sc_page_off - sizeof(struct r2net_msg) ==
-						be16_to_cpu(hdr->data_len)) {
-		/* we can only get here once, the first time we read
-		 * the payload.. so set ret to progress if the handler
-		 * works out. after calling this the message is toast */
-		ret = r2net_process_message(sc, hdr);
-		if (ret == 0)
-			ret = 1;
-		sc->sc_page_off = 0;
-	}
-
-out:
-	sclog(sc, "ret = %d\n", ret);
-	r2net_set_advance_stop_time(sc);
-	return ret;
-}
-
-/* this work func is triggerd by data ready.  it reads until it can read no
- * more.  it interprets 0, eof, as fatal.  if data_ready hits while we're doing
- * our work the work struct will be marked and we'll be called again. */
-static void r2net_rx_until_empty(struct work_struct *work)
-{
-	struct r2net_sock_container *sc =
-		container_of(work, struct r2net_sock_container, sc_rx_work);
-	int ret;
-
-	do {
-		ret = r2net_advance_rx(sc);
-	} while (ret > 0);
-
-	if (ret <= 0 && ret != -EAGAIN) {
-		struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num);
-		sclog(sc, "saw error %d, closing\n", ret);
-		/* not permanent so read failed handshake can retry */
-		r2net_ensure_shutdown(nn, sc, 0);
-	}
-
-	sc_put(sc);
-}
-
-static int r2net_set_nodelay(struct socket *sock)
-{
-	int ret, val = 1;
-	mm_segment_t oldfs;
-
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-
-	/*
-	 * Dear unsuspecting programmer,
-	 *
-	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
-	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
-	 * silently turn into SO_DEBUG.
-	 *
-	 * Yours,
-	 * Keeper of hilariously fragile interfaces.
-	 */
-	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
-				    (char __user *)&val, sizeof(val));
-
-	set_fs(oldfs);
-	return ret;
-}
-
-static void r2net_initialize_handshake(void)
-{
-	r2net_hand->r2hb_heartbeat_timeout_ms = cpu_to_be32(
-		R2HB_MAX_WRITE_TIMEOUT_MS);
-	r2net_hand->r2net_idle_timeout_ms = cpu_to_be32(r2net_idle_timeout());
-	r2net_hand->r2net_keepalive_delay_ms = cpu_to_be32(
-		r2net_keepalive_delay());
-	r2net_hand->r2net_reconnect_delay_ms = cpu_to_be32(
-		r2net_reconnect_delay());
-}
-
-/* ------------------------------------------------------------ */
-
-/* called when a connect completes and after a sock is accepted.  the
- * rx path will see the response and mark the sc valid */
-static void r2net_sc_connect_completed(struct work_struct *work)
-{
-	struct r2net_sock_container *sc =
-			container_of(work, struct r2net_sock_container,
-			     sc_connect_work);
-
-	mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n",
-		(unsigned long long)R2NET_PROTOCOL_VERSION,
-		(unsigned long long)be64_to_cpu(r2net_hand->connector_id));
-
-	r2net_initialize_handshake();
-	r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand));
-	sc_put(sc);
-}
-
-/* this is called as a work_struct func. */
-static void r2net_sc_send_keep_req(struct work_struct *work)
-{
-	struct r2net_sock_container *sc =
-		container_of(work, struct r2net_sock_container,
-			     sc_keepalive_work.work);
-
-	r2net_sendpage(sc, r2net_keep_req, sizeof(*r2net_keep_req));
-	sc_put(sc);
-}
-
-/* socket shutdown does a del_timer_sync against this as it tears down.
- * we can't start this timer until we've got to the point in sc buildup
- * where shutdown is going to be involved */
-static void r2net_idle_timer(unsigned long data)
-{
-	struct r2net_sock_container *sc = (struct r2net_sock_container *)data;
-#ifdef CONFIG_DEBUG_FS
-	unsigned long msecs = ktime_to_ms(ktime_get()) -
-		ktime_to_ms(sc->sc_tv_timer);
-#else
-	unsigned long msecs = r2net_idle_timeout();
-#endif
-
-	printk(KERN_NOTICE "ramster: Connection to " SC_NODEF_FMT " has been "
-	       "idle for %lu.%lu secs, shutting it down.\n",
-		sc->sc_node->nd_name, sc->sc_node->nd_num,
-		&sc->sc_node->nd_ipv4_address, ntohs(sc->sc_node->nd_ipv4_port),
-	       msecs / 1000, msecs % 1000);
-
-	/*
-	 * Initialize the nn_timeout so that the next connection attempt
-	 * will continue in r2net_start_connect.
-	 */
-	/* Avoid spurious shutdowns... not sure if this is still necessary */
-	pr_err("ramster_idle_timer, skipping shutdown work\n");
-#if 0
-	/* old code used to do these two lines */
-	atomic_set(&nn->nn_timeout, 1);
-	r2net_sc_queue_work(sc, &sc->sc_shutdown_work);
-#endif
-}
-
-static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc)
-{
-	r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
-	r2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-		      msecs_to_jiffies(r2net_keepalive_delay()));
-	r2net_set_sock_timer(sc);
-	mod_timer(&sc->sc_idle_timeout,
-	       jiffies + msecs_to_jiffies(r2net_idle_timeout()));
-}
-
-static void r2net_sc_postpone_idle(struct r2net_sock_container *sc)
-{
-	/* Only push out an existing timer */
-	if (timer_pending(&sc->sc_idle_timeout))
-		r2net_sc_reset_idle_timer(sc);
-}
-
-/* this work func is kicked whenever a path sets the nn state which doesn't
- * have valid set.  This includes seeing hb come up, losing a connection,
- * having a connect attempt fail, etc. This centralizes the logic which decides
- * if a connect attempt should be made or if we should give up and all future
- * transmit attempts should fail */
-static void r2net_start_connect(struct work_struct *work)
-{
-	struct r2net_node *nn =
-		container_of(work, struct r2net_node, nn_connect_work.work);
-	struct r2net_sock_container *sc = NULL;
-	struct r2nm_node *node = NULL, *mynode = NULL;
-	struct socket *sock = NULL;
-	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
-	int ret = 0, stop;
-	unsigned int timeout;
-
-	/* if we're greater we initiate tx, otherwise we accept */
-	if (r2nm_this_node() <= r2net_num_from_nn(nn))
-		goto out;
-
-	/* watch for racing with tearing a node down */
-	node = r2nm_get_node_by_num(r2net_num_from_nn(nn));
-	if (node == NULL) {
-		ret = 0;
-		goto out;
-	}
-
-	mynode = r2nm_get_node_by_num(r2nm_this_node());
-	if (mynode == NULL) {
-		ret = 0;
-		goto out;
-	}
-
-	spin_lock(&nn->nn_lock);
-	/*
-	 * see if we already have one pending or have given up.
-	 * For nn_timeout, it is set when we close the connection
-	 * because of the idle time out. So it means that we have
-	 * at least connected to that node successfully once,
-	 * now try to connect to it again.
-	 */
-	timeout = atomic_read(&nn->nn_timeout);
-	stop = (nn->nn_sc ||
-		(nn->nn_persistent_error &&
-		(nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
-	spin_unlock(&nn->nn_lock);
-	if (stop)
-		goto out;
-
-	nn->nn_last_connect_attempt = jiffies;
-
-	sc = sc_alloc(node);
-	if (sc == NULL) {
-		mlog(0, "couldn't allocate sc\n");
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (ret < 0) {
-		mlog(0, "can't create socket: %d\n", ret);
-		goto out;
-	}
-	sc->sc_sock = sock; /* freed by sc_kref_release */
-
-	sock->sk->sk_allocation = GFP_ATOMIC;
-
-	myaddr.sin_family = AF_INET;
-	myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
-	myaddr.sin_port = htons(0); /* any port */
-
-	ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
-			      sizeof(myaddr));
-	if (ret) {
-		mlog(ML_ERROR, "bind failed with %d at address %pI4\n",
-		     ret, &mynode->nd_ipv4_address);
-		goto out;
-	}
-
-	ret = r2net_set_nodelay(sc->sc_sock);
-	if (ret) {
-		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
-		goto out;
-	}
-
-	r2net_register_callbacks(sc->sc_sock->sk, sc);
-
-	spin_lock(&nn->nn_lock);
-	/* handshake completion will set nn->nn_sc_valid */
-	r2net_set_nn_state(nn, sc, 0, 0);
-	spin_unlock(&nn->nn_lock);
-
-	remoteaddr.sin_family = AF_INET;
-	remoteaddr.sin_addr.s_addr = node->nd_ipv4_address;
-	remoteaddr.sin_port = node->nd_ipv4_port;
-
-	ret = sc->sc_sock->ops->connect(sc->sc_sock,
-					(struct sockaddr *)&remoteaddr,
-					sizeof(remoteaddr),
-					O_NONBLOCK);
-	if (ret == -EINPROGRESS)
-		ret = 0;
-
-out:
-	if (ret) {
-		printk(KERN_NOTICE "ramster: Connect attempt to " SC_NODEF_FMT
-		       " failed with errno %d\n", sc->sc_node->nd_name,
-			sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address,
-			ntohs(sc->sc_node->nd_ipv4_port), ret);
-		/* 0 err so that another will be queued and attempted
-		 * from set_nn_state */
-		if (sc)
-			r2net_ensure_shutdown(nn, sc, 0);
-	}
-	if (sc)
-		sc_put(sc);
-	if (node)
-		r2nm_node_put(node);
-	if (mynode)
-		r2nm_node_put(mynode);
-
-	return;
-}
-
-static void r2net_connect_expired(struct work_struct *work)
-{
-	struct r2net_node *nn =
-		container_of(work, struct r2net_node, nn_connect_expired.work);
-
-	spin_lock(&nn->nn_lock);
-	if (!nn->nn_sc_valid) {
-		printk(KERN_NOTICE "ramster: No connection established with "
-		       "node %u after %u.%u seconds, giving up.\n",
-		     r2net_num_from_nn(nn),
-		     r2net_idle_timeout() / 1000,
-		     r2net_idle_timeout() % 1000);
-
-		r2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
-	}
-	spin_unlock(&nn->nn_lock);
-}
-
-static void r2net_still_up(struct work_struct *work)
-{
-}
-
-/* ------------------------------------------------------------ */
-
-void r2net_disconnect_node(struct r2nm_node *node)
-{
-	struct r2net_node *nn = r2net_nn_from_num(node->nd_num);
-
-	/* don't reconnect until it's heartbeating again */
-	spin_lock(&nn->nn_lock);
-	atomic_set(&nn->nn_timeout, 0);
-	r2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
-	spin_unlock(&nn->nn_lock);
-
-	if (r2net_wq) {
-		cancel_delayed_work(&nn->nn_connect_expired);
-		cancel_delayed_work(&nn->nn_connect_work);
-		cancel_delayed_work(&nn->nn_still_up);
-		flush_workqueue(r2net_wq);
-	}
-}
-
-static void r2net_hb_node_down_cb(struct r2nm_node *node, int node_num,
-				  void *data)
-{
-	if (!node)
-		return;
-
-	if (node_num != r2nm_this_node())
-		r2net_disconnect_node(node);
-
-	BUG_ON(atomic_read(&r2net_connected_peers) < 0);
-}
-
-static void r2net_hb_node_up_cb(struct r2nm_node *node, int node_num,
-				void *data)
-{
-	struct r2net_node *nn = r2net_nn_from_num(node_num);
-
-	BUG_ON(!node);
-
-	/* ensure an immediate connect attempt */
-	nn->nn_last_connect_attempt = jiffies -
-		(msecs_to_jiffies(r2net_reconnect_delay()) + 1);
-
-	if (node_num != r2nm_this_node()) {
-		/* believe it or not, accept and node hearbeating testing
-		 * can succeed for this node before we got here.. so
-		 * only use set_nn_state to clear the persistent error
-		 * if that hasn't already happened */
-		spin_lock(&nn->nn_lock);
-		atomic_set(&nn->nn_timeout, 0);
-		if (nn->nn_persistent_error)
-			r2net_set_nn_state(nn, NULL, 0, 0);
-		spin_unlock(&nn->nn_lock);
-	}
-}
-
-void r2net_unregister_hb_callbacks(void)
-{
-	r2hb_unregister_callback(NULL, &r2net_hb_up);
-	r2hb_unregister_callback(NULL, &r2net_hb_down);
-}
-
-int r2net_register_hb_callbacks(void)
-{
-	int ret;
-
-	r2hb_setup_callback(&r2net_hb_down, R2HB_NODE_DOWN_CB,
-			    r2net_hb_node_down_cb, NULL, R2NET_HB_PRI);
-	r2hb_setup_callback(&r2net_hb_up, R2HB_NODE_UP_CB,
-			    r2net_hb_node_up_cb, NULL, R2NET_HB_PRI);
-
-	ret = r2hb_register_callback(NULL, &r2net_hb_up);
-	if (ret == 0)
-		ret = r2hb_register_callback(NULL, &r2net_hb_down);
-
-	if (ret)
-		r2net_unregister_hb_callbacks();
-
-	return ret;
-}
-
-/* ------------------------------------------------------------ */
-
-static int r2net_accept_one(struct socket *sock)
-{
-	int ret, slen;
-	struct sockaddr_in sin;
-	struct socket *new_sock = NULL;
-	struct r2nm_node *node = NULL;
-	struct r2nm_node *local_node = NULL;
-	struct r2net_sock_container *sc = NULL;
-	struct r2net_node *nn;
-
-	BUG_ON(sock == NULL);
-	ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
-			       sock->sk->sk_protocol, &new_sock);
-	if (ret)
-		goto out;
-
-	new_sock->type = sock->type;
-	new_sock->ops = sock->ops;
-	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
-	if (ret < 0)
-		goto out;
-
-	new_sock->sk->sk_allocation = GFP_ATOMIC;
-
-	ret = r2net_set_nodelay(new_sock);
-	if (ret) {
-		mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret);
-		goto out;
-	}
-
-	slen = sizeof(sin);
-	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
-				       &slen, 1);
-	if (ret < 0)
-		goto out;
-
-	node = r2nm_get_node_by_ip(sin.sin_addr.s_addr);
-	if (node == NULL) {
-		printk(KERN_NOTICE "ramster: Attempt to connect from unknown "
-		       "node at %pI4:%d\n", &sin.sin_addr.s_addr,
-		       ntohs(sin.sin_port));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (r2nm_this_node() >= node->nd_num) {
-		local_node = r2nm_get_node_by_num(r2nm_this_node());
-		printk(KERN_NOTICE "ramster: Unexpected connect attempt seen "
-		       "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
-		       "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
-		       &(local_node->nd_ipv4_address),
-		       ntohs(local_node->nd_ipv4_port), node->nd_name,
-		       node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* this happens all the time when the other node sees our heartbeat
-	 * and tries to connect before we see their heartbeat */
-	if (!r2hb_check_node_heartbeating_from_callback(node->nd_num)) {
-		mlog(ML_CONN, "attempt to connect from node '%s' at "
-		     "%pI4:%d but it isn't heartbeating\n",
-		     node->nd_name, &sin.sin_addr.s_addr,
-		     ntohs(sin.sin_port));
-		ret = -EINVAL;
-		goto out;
-	}
-
-	nn = r2net_nn_from_num(node->nd_num);
-
-	spin_lock(&nn->nn_lock);
-	if (nn->nn_sc)
-		ret = -EBUSY;
-	else
-		ret = 0;
-	spin_unlock(&nn->nn_lock);
-	if (ret) {
-		printk(KERN_NOTICE "ramster: Attempt to connect from node '%s' "
-		       "at %pI4:%d but it already has an open connection\n",
-		       node->nd_name, &sin.sin_addr.s_addr,
-		       ntohs(sin.sin_port));
-		goto out;
-	}
-
-	sc = sc_alloc(node);
-	if (sc == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	sc->sc_sock = new_sock;
-	new_sock = NULL;
-
-	spin_lock(&nn->nn_lock);
-	atomic_set(&nn->nn_timeout, 0);
-	r2net_set_nn_state(nn, sc, 0, 0);
-	spin_unlock(&nn->nn_lock);
-
-	r2net_register_callbacks(sc->sc_sock->sk, sc);
-	r2net_sc_queue_work(sc, &sc->sc_rx_work);
-
-	r2net_initialize_handshake();
-	r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand));
-
-out:
-	if (new_sock)
-		sock_release(new_sock);
-	if (node)
-		r2nm_node_put(node);
-	if (local_node)
-		r2nm_node_put(local_node);
-	if (sc)
-		sc_put(sc);
-	return ret;
-}
-
-static void r2net_accept_many(struct work_struct *work)
-{
-	struct socket *sock = r2net_listen_sock;
-	while (r2net_accept_one(sock) == 0)
-		cond_resched();
-}
-
-static void r2net_listen_data_ready(struct sock *sk, int bytes)
-{
-	void (*ready)(struct sock *sk, int bytes);
-
-	read_lock(&sk->sk_callback_lock);
-	ready = sk->sk_user_data;
-	if (ready == NULL) { /* check for teardown race */
-		ready = sk->sk_data_ready;
-		goto out;
-	}
-
-	/* ->sk_data_ready is also called for a newly established child socket
-	 * before it has been accepted and the acceptor has set up their
-	 * data_ready.. we only want to queue listen work for our listening
-	 * socket */
-	if (sk->sk_state == TCP_LISTEN) {
-		mlog(ML_TCP, "bytes: %d\n", bytes);
-		queue_work(r2net_wq, &r2net_listen_work);
-	}
-
-out:
-	read_unlock(&sk->sk_callback_lock);
-	ready(sk, bytes);
-}
-
-static int r2net_open_listening_sock(__be32 addr, __be16 port)
-{
-	struct socket *sock = NULL;
-	int ret;
-	struct sockaddr_in sin = {
-		.sin_family = PF_INET,
-		.sin_addr = { .s_addr = addr },
-		.sin_port = port,
-	};
-
-	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (ret < 0) {
-		printk(KERN_ERR "ramster: Error %d while creating socket\n",
-			ret);
-		goto out;
-	}
-
-	sock->sk->sk_allocation = GFP_ATOMIC;
-
-	write_lock_bh(&sock->sk->sk_callback_lock);
-	sock->sk->sk_user_data = sock->sk->sk_data_ready;
-	sock->sk->sk_data_ready = r2net_listen_data_ready;
-	write_unlock_bh(&sock->sk->sk_callback_lock);
-
-	r2net_listen_sock = sock;
-	INIT_WORK(&r2net_listen_work, r2net_accept_many);
-
-	sock->sk->sk_reuse = 1;
-	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
-	if (ret < 0) {
-		printk(KERN_ERR "ramster: Error %d while binding socket at "
-			"%pI4:%u\n", ret, &addr, ntohs(port));
-		goto out;
-	}
-
-	ret = sock->ops->listen(sock, 64);
-	if (ret < 0)
-		printk(KERN_ERR "ramster: Error %d while listening on %pI4:%u\n",
-		       ret, &addr, ntohs(port));
-
-out:
-	if (ret) {
-		r2net_listen_sock = NULL;
-		if (sock)
-			sock_release(sock);
-	}
-	return ret;
-}
-
-/*
- * called from node manager when we should bring up our network listening
- * socket.  node manager handles all the serialization to only call this
- * once and to match it with r2net_stop_listening().  note,
- * r2nm_this_node() doesn't work yet as we're being called while it
- * is being set up.
- */
-int r2net_start_listening(struct r2nm_node *node)
-{
-	int ret = 0;
-
-	BUG_ON(r2net_wq != NULL);
-	BUG_ON(r2net_listen_sock != NULL);
-
-	mlog(ML_KTHREAD, "starting r2net thread...\n");
-	r2net_wq = create_singlethread_workqueue("r2net");
-	if (r2net_wq == NULL) {
-		mlog(ML_ERROR, "unable to launch r2net thread\n");
-		return -ENOMEM; /* ? */
-	}
-
-	ret = r2net_open_listening_sock(node->nd_ipv4_address,
-					node->nd_ipv4_port);
-	if (ret) {
-		destroy_workqueue(r2net_wq);
-		r2net_wq = NULL;
-	}
-
-	return ret;
-}
-
-/* again, r2nm_this_node() doesn't work here as we're involved in
- * tearing it down */
-void r2net_stop_listening(struct r2nm_node *node)
-{
-	struct socket *sock = r2net_listen_sock;
-	size_t i;
-
-	BUG_ON(r2net_wq == NULL);
-	BUG_ON(r2net_listen_sock == NULL);
-
-	/* stop the listening socket from generating work */
-	write_lock_bh(&sock->sk->sk_callback_lock);
-	sock->sk->sk_data_ready = sock->sk->sk_user_data;
-	sock->sk->sk_user_data = NULL;
-	write_unlock_bh(&sock->sk->sk_callback_lock);
-
-	for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) {
-		struct r2nm_node *node = r2nm_get_node_by_num(i);
-		if (node) {
-			r2net_disconnect_node(node);
-			r2nm_node_put(node);
-		}
-	}
-
-	/* finish all work and tear down the work queue */
-	mlog(ML_KTHREAD, "waiting for r2net thread to exit....\n");
-	destroy_workqueue(r2net_wq);
-	r2net_wq = NULL;
-
-	sock_release(r2net_listen_sock);
-	r2net_listen_sock = NULL;
-}
-
-void r2net_hb_node_up_manual(int node_num)
-{
-	struct r2nm_node dummy;
-	if (r2nm_single_cluster == NULL)
-		pr_err("ramster: cluster not alive, node_up_manual ignored\n");
-	else {
-		r2hb_manual_set_node_heartbeating(node_num);
-		r2net_hb_node_up_cb(&dummy, node_num, NULL);
-	}
-}
-
-/* ------------------------------------------------------------ */
-
-int r2net_init(void)
-{
-	unsigned long i;
-
-	if (r2net_debugfs_init())
-		return -ENOMEM;
-
-	r2net_hand = kzalloc(sizeof(struct r2net_handshake), GFP_KERNEL);
-	r2net_keep_req = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL);
-	r2net_keep_resp = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL);
-	if (!r2net_hand || !r2net_keep_req || !r2net_keep_resp) {
-		kfree(r2net_hand);
-		kfree(r2net_keep_req);
-		kfree(r2net_keep_resp);
-		return -ENOMEM;
-	}
-
-	r2net_hand->protocol_version = cpu_to_be64(R2NET_PROTOCOL_VERSION);
-	r2net_hand->connector_id = cpu_to_be64(1);
-
-	r2net_keep_req->magic = cpu_to_be16(R2NET_MSG_KEEP_REQ_MAGIC);
-	r2net_keep_resp->magic = cpu_to_be16(R2NET_MSG_KEEP_RESP_MAGIC);
-
-	for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) {
-		struct r2net_node *nn = r2net_nn_from_num(i);
-
-		atomic_set(&nn->nn_timeout, 0);
-		spin_lock_init(&nn->nn_lock);
-		INIT_DELAYED_WORK(&nn->nn_connect_work, r2net_start_connect);
-		INIT_DELAYED_WORK(&nn->nn_connect_expired,
-				  r2net_connect_expired);
-		INIT_DELAYED_WORK(&nn->nn_still_up, r2net_still_up);
-		/* until we see hb from a node we'll return einval */
-		nn->nn_persistent_error = -ENOTCONN;
-		init_waitqueue_head(&nn->nn_sc_wq);
-		idr_init(&nn->nn_status_idr);
-		INIT_LIST_HEAD(&nn->nn_status_list);
-	}
-
-	return 0;
-}
-
-void r2net_exit(void)
-{
-	kfree(r2net_hand);
-	kfree(r2net_keep_req);
-	kfree(r2net_keep_resp);
-	r2net_debugfs_exit();
-}
diff --git a/drivers/staging/ramster/cluster/tcp.h b/drivers/staging/ramster/cluster/tcp.h
deleted file mode 100755
index 9d0583345..000000000
--- a/drivers/staging/ramster/cluster/tcp.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * tcp.h
- *
- * Function prototypes
- *
- * Copyright (C) 2004 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef R2CLUSTER_TCP_H
-#define R2CLUSTER_TCP_H
-
-#include <linux/socket.h>
-#ifdef __KERNEL__
-#include <net/sock.h>
-#include <linux/tcp.h>
-#else
-#include <sys/socket.h>
-#endif
-#include <linux/inet.h>
-#include <linux/in.h>
-
-struct r2net_msg {
-	__be16 magic;
-	__be16 data_len;
-	__be16 msg_type;
-	__be16 pad1;
-	__be32 sys_status;
-	__be32 status;
-	__be32 key;
-	__be32 msg_num;
-	__u8  buf[0];
-};
-
-typedef int (r2net_msg_handler_func)(struct r2net_msg *msg, u32 len, void *data,
-				     void **ret_data);
-typedef void (r2net_post_msg_handler_func)(int status, void *data,
-					   void *ret_data);
-
-#define R2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct r2net_msg))
-
-/* same as hb delay, we're waiting for another node to recognize our hb */
-#define R2NET_RECONNECT_DELAY_MS_DEFAULT	2000
-
-#define R2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
-#define R2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
-
-
-/* TODO: figure this out.... */
-static inline int r2net_link_down(int err, struct socket *sock)
-{
-	if (sock) {
-		if (sock->sk->sk_state != TCP_ESTABLISHED &&
-			sock->sk->sk_state != TCP_CLOSE_WAIT)
-			return 1;
-	}
-
-	if (err >= 0)
-		return 0;
-	switch (err) {
-
-	/* ????????????????????????? */
-	case -ERESTARTSYS:
-	case -EBADF:
-	/* When the server has died, an ICMP port unreachable
-	 * message prompts ECONNREFUSED. */
-	case -ECONNREFUSED:
-	case -ENOTCONN:
-	case -ECONNRESET:
-	case -EPIPE:
-		return 1;
-
-	}
-	return 0;
-}
-
-enum {
-	R2NET_DRIVER_UNINITED,
-	R2NET_DRIVER_READY,
-};
-
-int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
-		       u8 target_node, int *status);
-int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
-			   size_t veclen, u8 target_node, int *status);
-
-int r2net_register_handler(u32 msg_type, u32 key, u32 max_len,
-			   r2net_msg_handler_func *func, void *data,
-			   r2net_post_msg_handler_func *post_func,
-			   struct list_head *unreg_list);
-void r2net_unregister_handler_list(struct list_head *list);
-
-void r2net_fill_node_map(unsigned long *map, unsigned bytes);
-
-void r2net_force_data_magic(struct r2net_msg *, u16, u32);
-void r2net_hb_node_up_manual(int);
-struct r2net_node *r2net_nn_from_num(u8);
-
-struct r2nm_node;
-int r2net_register_hb_callbacks(void);
-void r2net_unregister_hb_callbacks(void);
-int r2net_start_listening(struct r2nm_node *node);
-void r2net_stop_listening(struct r2nm_node *node);
-void r2net_disconnect_node(struct r2nm_node *node);
-int r2net_num_connected_peers(void);
-
-int r2net_init(void);
-void r2net_exit(void);
-
-struct r2net_send_tracking;
-struct r2net_sock_container;
-
-#if 0
-int r2net_debugfs_init(void);
-void r2net_debugfs_exit(void);
-void r2net_debug_add_nst(struct r2net_send_tracking *nst);
-void r2net_debug_del_nst(struct r2net_send_tracking *nst);
-void r2net_debug_add_sc(struct r2net_sock_container *sc);
-void r2net_debug_del_sc(struct r2net_sock_container *sc);
-#else
-static inline int r2net_debugfs_init(void)
-{
-	return 0;
-}
-static inline void r2net_debugfs_exit(void)
-{
-}
-static inline void r2net_debug_add_nst(struct r2net_send_tracking *nst)
-{
-}
-static inline void r2net_debug_del_nst(struct r2net_send_tracking *nst)
-{
-}
-static inline void r2net_debug_add_sc(struct r2net_sock_container *sc)
-{
-}
-static inline void r2net_debug_del_sc(struct r2net_sock_container *sc)
-{
-}
-#endif	/* CONFIG_DEBUG_FS */
-
-#endif /* R2CLUSTER_TCP_H */
diff --git a/drivers/staging/ramster/cluster/tcp_internal.h b/drivers/staging/ramster/cluster/tcp_internal.h
deleted file mode 100755
index 4d8cc9f96..000000000
--- a/drivers/staging/ramster/cluster/tcp_internal.h
+++ /dev/null
@@ -1,248 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * Copyright (C) 2005 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef R2CLUSTER_TCP_INTERNAL_H
-#define R2CLUSTER_TCP_INTERNAL_H
-
-#define R2NET_MSG_MAGIC           ((u16)0xfa55)
-#define R2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
-#define R2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
-#define R2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)
-/*
- * "data magic" is a long version of "status magic" where the message
- * payload actually contains data to be passed in reply to certain messages
- */
-#define R2NET_MSG_DATA_MAGIC      ((u16)0xfa59)
-
-/* we're delaying our quorum decision so that heartbeat will have timed
- * out truly dead nodes by the time we come around to making decisions
- * on their number */
-#define R2NET_QUORUM_DELAY_MS	\
-		((r2hb_dead_threshold + 2) * R2HB_REGION_TIMEOUT_MS)
-
-/*
- * This version number represents quite a lot, unfortunately.  It not
- * only represents the raw network message protocol on the wire but also
- * locking semantics of the file system using the protocol.  It should
- * be somewhere else, I'm sure, but right now it isn't.
- *
- * With version 11, we separate out the filesystem locking portion.  The
- * filesystem now has a major.minor version it negotiates.  Version 11
- * introduces this negotiation to the r2dlm protocol, and as such the
- * version here in tcp_internal.h should not need to be bumped for
- * filesystem locking changes.
- *
- * New in version 11
- *	- Negotiation of filesystem locking in the dlm join.
- *
- * New in version 10:
- *	- Meta/data locks combined
- *
- * New in version 9:
- *	- All votes removed
- *
- * New in version 8:
- *	- Replace delete inode votes with a cluster lock
- *
- * New in version 7:
- *	- DLM join domain includes the live nodemap
- *
- * New in version 6:
- *	- DLM lockres remote refcount fixes.
- *
- * New in version 5:
- *	- Network timeout checking protocol
- *
- * New in version 4:
- *	- Remove i_generation from lock names for better stat performance.
- *
- * New in version 3:
- *	- Replace dentry votes with a cluster lock
- *
- * New in version 2:
- *	- full 64 bit i_size in the metadata lock lvbs
- *	- introduction of "rw" lock and pushing meta/data locking down
- */
-#define R2NET_PROTOCOL_VERSION 11ULL
-struct r2net_handshake {
-	__be64	protocol_version;
-	__be64	connector_id;
-	__be32  r2hb_heartbeat_timeout_ms;
-	__be32  r2net_idle_timeout_ms;
-	__be32  r2net_keepalive_delay_ms;
-	__be32  r2net_reconnect_delay_ms;
-};
-
-struct r2net_node {
-	/* this is never called from int/bh */
-	spinlock_t			nn_lock;
-
-	/* set the moment an sc is allocated and a connect is started */
-	struct r2net_sock_container	*nn_sc;
-	/* _valid is only set after the handshake passes and tx can happen */
-	unsigned			nn_sc_valid:1;
-	/* if this is set tx just returns it */
-	int				nn_persistent_error;
-	/* It is only set to 1 after the idle time out. */
-	atomic_t			nn_timeout;
-
-	/* threads waiting for an sc to arrive wait on the wq for generation
-	 * to increase.  it is increased when a connecting socket succeeds
-	 * or fails or when an accepted socket is attached. */
-	wait_queue_head_t		nn_sc_wq;
-
-	struct idr			nn_status_idr;
-	struct list_head		nn_status_list;
-
-	/* connects are attempted from when heartbeat comes up until either hb
-	 * goes down, the node is unconfigured, no connect attempts succeed
-	 * before R2NET_CONN_IDLE_DELAY, or a connect succeeds.  connect_work
-	 * is queued from set_nn_state both from hb up and from itself if a
-	 * connect attempt fails and so can be self-arming.  shutdown is
-	 * careful to first mark the nn such that no connects will be attempted
-	 * before canceling delayed connect work and flushing the queue. */
-	struct delayed_work		nn_connect_work;
-	unsigned long			nn_last_connect_attempt;
-
-	/* this is queued as nodes come up and is canceled when a connection is
-	 * established.  this expiring gives up on the node and errors out
-	 * transmits */
-	struct delayed_work		nn_connect_expired;
-
-	/* after we give up on a socket we wait a while before deciding
-	 * that it is still heartbeating and that we should do some
-	 * quorum work */
-	struct delayed_work		nn_still_up;
-};
-
-struct r2net_sock_container {
-	struct kref		sc_kref;
-	/* the next two are valid for the life time of the sc */
-	struct socket		*sc_sock;
-	struct r2nm_node	*sc_node;
-
-	/* all of these sc work structs hold refs on the sc while they are
-	 * queued.  they should not be able to ref a freed sc.  the teardown
-	 * race is with r2net_wq destruction in r2net_stop_listening() */
-
-	/* rx and connect work are generated from socket callbacks.  sc
-	 * shutdown removes the callbacks and then flushes the work queue */
-	struct work_struct	sc_rx_work;
-	struct work_struct	sc_connect_work;
-	/* shutdown work is triggered in two ways.  the simple way is
-	 * for a code path calls ensure_shutdown which gets a lock, removes
-	 * the sc from the nn, and queues the work.  in this case the
-	 * work is single-shot.  the work is also queued from a sock
-	 * callback, though, and in this case the work will find the sc
-	 * still on the nn and will call ensure_shutdown itself.. this
-	 * ends up triggering the shutdown work again, though nothing
-	 * will be done in that second iteration.  so work queue teardown
-	 * has to be careful to remove the sc from the nn before waiting
-	 * on the work queue so that the shutdown work doesn't remove the
-	 * sc and rearm itself.
-	 */
-	struct work_struct	sc_shutdown_work;
-
-	struct timer_list	sc_idle_timeout;
-	struct delayed_work	sc_keepalive_work;
-
-	unsigned		sc_handshake_ok:1;
-
-	struct page		*sc_page;
-	size_t			sc_page_off;
-
-	/* original handlers for the sockets */
-	void			(*sc_state_change)(struct sock *sk);
-	void			(*sc_data_ready)(struct sock *sk, int bytes);
-
-	u32			sc_msg_key;
-	u16			sc_msg_type;
-
-#ifdef CONFIG_DEBUG_FS
-	struct list_head        sc_net_debug_item;
-	ktime_t			sc_tv_timer;
-	ktime_t			sc_tv_data_ready;
-	ktime_t			sc_tv_advance_start;
-	ktime_t			sc_tv_advance_stop;
-	ktime_t			sc_tv_func_start;
-	ktime_t			sc_tv_func_stop;
-#endif
-#ifdef CONFIG_RAMSTER_FS_STATS
-	ktime_t			sc_tv_acquiry_total;
-	ktime_t			sc_tv_send_total;
-	ktime_t			sc_tv_status_total;
-	u32			sc_send_count;
-	u32			sc_recv_count;
-	ktime_t			sc_tv_process_total;
-#endif
-	struct mutex		sc_send_lock;
-};
-
-struct r2net_msg_handler {
-	struct rb_node		nh_node;
-	u32			nh_max_len;
-	u32			nh_msg_type;
-	u32			nh_key;
-	r2net_msg_handler_func	*nh_func;
-	r2net_msg_handler_func	*nh_func_data;
-	r2net_post_msg_handler_func
-				*nh_post_func;
-	struct kref		nh_kref;
-	struct list_head	nh_unregister_item;
-};
-
-enum r2net_system_error {
-	R2NET_ERR_NONE = 0,
-	R2NET_ERR_NO_HNDLR,
-	R2NET_ERR_OVERFLOW,
-	R2NET_ERR_DIED,
-	R2NET_ERR_MAX
-};
-
-struct r2net_status_wait {
-	enum r2net_system_error	ns_sys_status;
-	s32			ns_status;
-	int			ns_id;
-	wait_queue_head_t	ns_wq;
-	struct list_head	ns_node_item;
-};
-
-#ifdef CONFIG_DEBUG_FS
-/* just for state dumps */
-struct r2net_send_tracking {
-	struct list_head		st_net_debug_item;
-	struct task_struct		*st_task;
-	struct r2net_sock_container	*st_sc;
-	u32				st_id;
-	u32				st_msg_type;
-	u32				st_msg_key;
-	u8				st_node;
-	ktime_t				st_sock_time;
-	ktime_t				st_send_time;
-	ktime_t				st_status_time;
-};
-#else
-struct r2net_send_tracking {
-	u32	dummy;
-};
-#endif	/* CONFIG_DEBUG_FS */
-
-#endif /* R2CLUSTER_TCP_INTERNAL_H */
diff --git a/drivers/staging/ramster/r2net.c b/drivers/staging/ramster/r2net.c
deleted file mode 100644
index 2ee02204c..000000000
--- a/drivers/staging/ramster/r2net.c
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * r2net.c
- *
- * Copyright (c) 2011, Dan Magenheimer, Oracle Corp.
- *
- * Ramster_r2net provides an interface between zcache and r2net.
- *
- * FIXME: support more than two nodes
- */
-
-#include <linux/list.h>
-#include "cluster/tcp.h"
-#include "cluster/nodemanager.h"
-#include "tmem.h"
-#include "zcache.h"
-#include "ramster.h"
-
-#define RAMSTER_TESTING
-
-#define RMSTR_KEY	0x77347734
-
-enum {
-	RMSTR_TMEM_PUT_EPH = 100,
-	RMSTR_TMEM_PUT_PERS,
-	RMSTR_TMEM_ASYNC_GET_REQUEST,
-	RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST,
-	RMSTR_TMEM_ASYNC_GET_REPLY,
-	RMSTR_TMEM_FLUSH,
-	RMSTR_TMEM_FLOBJ,
-	RMSTR_TMEM_DESTROY_POOL,
-};
-
-#define RMSTR_R2NET_MAX_LEN \
-		(R2NET_MAX_PAYLOAD_BYTES - sizeof(struct tmem_xhandle))
-
-#include "cluster/tcp_internal.h"
-
-static struct r2nm_node *r2net_target_node;
-static int r2net_target_nodenum;
-
-int r2net_remote_target_node_set(int node_num)
-{
-	int ret = -1;
-
-	r2net_target_node = r2nm_get_node_by_num(node_num);
-	if (r2net_target_node != NULL) {
-		r2net_target_nodenum = node_num;
-		r2nm_node_put(r2net_target_node);
-		ret = 0;
-	}
-	return ret;
-}
-
-/* FIXME following buffer should be per-cpu, protected by preempt_disable */
-static char ramster_async_get_buf[R2NET_MAX_PAYLOAD_BYTES];
-
-static int ramster_remote_async_get_request_handler(struct r2net_msg *msg,
-				u32 len, void *data, void **ret_data)
-{
-	char *pdata;
-	struct tmem_xhandle xh;
-	int found;
-	size_t size = RMSTR_R2NET_MAX_LEN;
-	u16 msgtype = be16_to_cpu(msg->msg_type);
-	bool get_and_free = (msgtype == RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST);
-	unsigned long flags;
-
-	xh = *(struct tmem_xhandle *)msg->buf;
-	if (xh.xh_data_size > RMSTR_R2NET_MAX_LEN)
-		BUG();
-	pdata = ramster_async_get_buf;
-	*(struct tmem_xhandle *)pdata = xh;
-	pdata += sizeof(struct tmem_xhandle);
-	local_irq_save(flags);
-	found = zcache_get(xh.client_id, xh.pool_id, &xh.oid, xh.index,
-				pdata, &size, 1, get_and_free ? 1 : -1);
-	local_irq_restore(flags);
-	if (found < 0) {
-		/* a zero size indicates the get failed */
-		size = 0;
-	}
-	if (size > RMSTR_R2NET_MAX_LEN)
-		BUG();
-	*ret_data = pdata - sizeof(struct tmem_xhandle);
-	/* now make caller (r2net_process_message) handle specially */
-	r2net_force_data_magic(msg, RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY);
-	return size + sizeof(struct tmem_xhandle);
-}
-
-static int ramster_remote_async_get_reply_handler(struct r2net_msg *msg,
-				u32 len, void *data, void **ret_data)
-{
-	char *in = (char *)msg->buf;
-	int datalen = len - sizeof(struct r2net_msg);
-	int ret = -1;
-	struct tmem_xhandle *xh = (struct tmem_xhandle *)in;
-
-	in += sizeof(struct tmem_xhandle);
-	datalen -= sizeof(struct tmem_xhandle);
-	BUG_ON(datalen < 0 || datalen > PAGE_SIZE);
-	ret = zcache_localify(xh->pool_id, &xh->oid, xh->index,
-				in, datalen, xh->extra);
-#ifdef RAMSTER_TESTING
-	if (ret == -EEXIST)
-		pr_err("TESTING ArrgREP, aborted overwrite on racy put\n");
-#endif
-	return ret;
-}
-
-int ramster_remote_put_handler(struct r2net_msg *msg,
-				u32 len, void *data, void **ret_data)
-{
-	struct tmem_xhandle *xh;
-	char *p = (char *)msg->buf;
-	int datalen = len - sizeof(struct r2net_msg) -
-				sizeof(struct tmem_xhandle);
-	u16 msgtype = be16_to_cpu(msg->msg_type);
-	bool ephemeral = (msgtype == RMSTR_TMEM_PUT_EPH);
-	unsigned long flags;
-	int ret;
-
-	xh = (struct tmem_xhandle *)p;
-	p += sizeof(struct tmem_xhandle);
-	zcache_autocreate_pool(xh->client_id, xh->pool_id, ephemeral);
-	local_irq_save(flags);
-	ret = zcache_put(xh->client_id, xh->pool_id, &xh->oid, xh->index,
-				p, datalen, 1, ephemeral ? 1 : -1);
-	local_irq_restore(flags);
-	return ret;
-}
-
-int ramster_remote_flush_handler(struct r2net_msg *msg,
-				u32 len, void *data, void **ret_data)
-{
-	struct tmem_xhandle *xh;
-	char *p = (char *)msg->buf;
-
-	xh = (struct tmem_xhandle *)p;
-	p += sizeof(struct tmem_xhandle);
-	(void)zcache_flush(xh->client_id, xh->pool_id, &xh->oid, xh->index);
-	return 0;
-}
-
-int ramster_remote_flobj_handler(struct r2net_msg *msg,
-				u32 len, void *data, void **ret_data)
-{
-	struct tmem_xhandle *xh;
-	char *p = (char *)msg->buf;
-
-	xh = (struct tmem_xhandle *)p;
-	p += sizeof(struct tmem_xhandle);
-	(void)zcache_flush_object(xh->client_id, xh->pool_id, &xh->oid);
-	return 0;
-}
-
-int ramster_remote_async_get(struct tmem_xhandle *xh, bool free, int remotenode,
-				size_t expect_size, uint8_t expect_cksum,
-				void *extra)
-{
-	int ret = -1, status;
-	struct r2nm_node *node = NULL;
-	struct kvec vec[1];
-	size_t veclen = 1;
-	u32 msg_type;
-
-	node = r2nm_get_node_by_num(remotenode);
-	if (node == NULL)
-		goto out;
-	xh->client_id = r2nm_this_node(); /* which node is getting */
-	xh->xh_data_cksum = expect_cksum;
-	xh->xh_data_size = expect_size;
-	xh->extra = extra;
-	vec[0].iov_len = sizeof(*xh);
-	vec[0].iov_base = xh;
-	if (free)
-		msg_type = RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST;
-	else
-		msg_type = RMSTR_TMEM_ASYNC_GET_REQUEST;
-	ret = r2net_send_message_vec(msg_type, RMSTR_KEY,
-					vec, veclen, remotenode, &status);
-	r2nm_node_put(node);
-	if (ret < 0) {
-		/* FIXME handle bad message possibilities here? */
-		pr_err("UNTESTED ret<0 in ramster_remote_async_get\n");
-	}
-	ret = status;
-out:
-	return ret;
-}
-
-#ifdef RAMSTER_TESTING
-/* leave me here to see if it catches a weird crash */
-static void ramster_check_irq_counts(void)
-{
-	static int last_hardirq_cnt, last_softirq_cnt, last_preempt_cnt;
-	int cur_hardirq_cnt, cur_softirq_cnt, cur_preempt_cnt;
-
-	cur_hardirq_cnt = hardirq_count() >> HARDIRQ_SHIFT;
-	if (cur_hardirq_cnt > last_hardirq_cnt) {
-		last_hardirq_cnt = cur_hardirq_cnt;
-		if (!(last_hardirq_cnt&(last_hardirq_cnt-1)))
-			pr_err("RAMSTER TESTING RRP hardirq_count=%d\n",
-				last_hardirq_cnt);
-	}
-	cur_softirq_cnt = softirq_count() >> SOFTIRQ_SHIFT;
-	if (cur_softirq_cnt > last_softirq_cnt) {
-		last_softirq_cnt = cur_softirq_cnt;
-		if (!(last_softirq_cnt&(last_softirq_cnt-1)))
-			pr_err("RAMSTER TESTING RRP softirq_count=%d\n",
-				last_softirq_cnt);
-	}
-	cur_preempt_cnt = preempt_count() & PREEMPT_MASK;
-	if (cur_preempt_cnt > last_preempt_cnt) {
-		last_preempt_cnt = cur_preempt_cnt;
-		if (!(last_preempt_cnt&(last_preempt_cnt-1)))
-			pr_err("RAMSTER TESTING RRP preempt_count=%d\n",
-				last_preempt_cnt);
-	}
-}
-#endif
-
-int ramster_remote_put(struct tmem_xhandle *xh, char *data, size_t size,
-				bool ephemeral, int *remotenode)
-{
-	int nodenum, ret = -1, status;
-	struct r2nm_node *node = NULL;
-	struct kvec vec[2];
-	size_t veclen = 2;
-	u32 msg_type;
-#ifdef RAMSTER_TESTING
-	struct r2net_node *nn;
-#endif
-
-	BUG_ON(size > RMSTR_R2NET_MAX_LEN);
-	xh->client_id = r2nm_this_node(); /* which node is putting */
-	vec[0].iov_len = sizeof(*xh);
-	vec[0].iov_base = xh;
-	vec[1].iov_len = size;
-	vec[1].iov_base = data;
-	node = r2net_target_node;
-	if (!node)
-		goto out;
-
-	nodenum = r2net_target_nodenum;
-
-	r2nm_node_get(node);
-
-#ifdef RAMSTER_TESTING
-	nn = r2net_nn_from_num(nodenum);
-	WARN_ON_ONCE(nn->nn_persistent_error || !nn->nn_sc_valid);
-#endif
-
-	if (ephemeral)
-		msg_type = RMSTR_TMEM_PUT_EPH;
-	else
-		msg_type = RMSTR_TMEM_PUT_PERS;
-#ifdef RAMSTER_TESTING
-	/* leave me here to see if it catches a weird crash */
-	ramster_check_irq_counts();
-#endif
-
-	ret = r2net_send_message_vec(msg_type, RMSTR_KEY, vec, veclen,
-						nodenum, &status);
-#ifdef RAMSTER_TESTING
-	if (ret != 0) {
-		static unsigned long cnt;
-		cnt++;
-		if (!(cnt&(cnt-1)))
-			pr_err("ramster_remote_put: message failed, "
-				"ret=%d, cnt=%lu\n", ret, cnt);
-		ret = -1;
-	}
-#endif
-	if (ret < 0)
-		ret = -1;
-	else {
-		ret = status;
-		*remotenode = nodenum;
-	}
-
-	r2nm_node_put(node);
-out:
-	return ret;
-}
-
-int ramster_remote_flush(struct tmem_xhandle *xh, int remotenode)
-{
-	int ret = -1, status;
-	struct r2nm_node *node = NULL;
-	struct kvec vec[1];
-	size_t veclen = 1;
-
-	node = r2nm_get_node_by_num(remotenode);
-	BUG_ON(node == NULL);
-	xh->client_id = r2nm_this_node(); /* which node is flushing */
-	vec[0].iov_len = sizeof(*xh);
-	vec[0].iov_base = xh;
-	BUG_ON(irqs_disabled());
-	BUG_ON(in_softirq());
-	ret = r2net_send_message_vec(RMSTR_TMEM_FLUSH, RMSTR_KEY,
-					vec, veclen, remotenode, &status);
-	r2nm_node_put(node);
-	return ret;
-}
-
-int ramster_remote_flush_object(struct tmem_xhandle *xh, int remotenode)
-{
-	int ret = -1, status;
-	struct r2nm_node *node = NULL;
-	struct kvec vec[1];
-	size_t veclen = 1;
-
-	node = r2nm_get_node_by_num(remotenode);
-	BUG_ON(node == NULL);
-	xh->client_id = r2nm_this_node(); /* which node is flobjing */
-	vec[0].iov_len = sizeof(*xh);
-	vec[0].iov_base = xh;
-	ret = r2net_send_message_vec(RMSTR_TMEM_FLOBJ, RMSTR_KEY,
-					vec, veclen, remotenode, &status);
-	r2nm_node_put(node);
-	return ret;
-}
-
-/*
- * Handler registration
- */
-
-static LIST_HEAD(r2net_unreg_list);
-
-static void r2net_unregister_handlers(void)
-{
-	r2net_unregister_handler_list(&r2net_unreg_list);
-}
-
-int r2net_register_handlers(void)
-{
-	int status;
-
-	status = r2net_register_handler(RMSTR_TMEM_PUT_EPH, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_put_handler,
-				NULL, NULL, &r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_PUT_PERS, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_put_handler,
-				NULL, NULL, &r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REQUEST, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_async_get_request_handler,
-				NULL, NULL,
-				&r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST,
-				RMSTR_KEY, RMSTR_R2NET_MAX_LEN,
-				ramster_remote_async_get_request_handler,
-				NULL, NULL,
-				&r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_async_get_reply_handler,
-				NULL, NULL,
-				&r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_FLUSH, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_flush_handler,
-				NULL, NULL,
-				&r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	status = r2net_register_handler(RMSTR_TMEM_FLOBJ, RMSTR_KEY,
-				RMSTR_R2NET_MAX_LEN,
-				ramster_remote_flobj_handler,
-				NULL, NULL,
-				&r2net_unreg_list);
-	if (status)
-		goto bail;
-
-	pr_info("ramster: r2net handlers registered\n");
-
-bail:
-	if (status) {
-		r2net_unregister_handlers();
-		pr_err("ramster: couldn't register r2net handlers\n");
-	}
-	return status;
-}
diff --git a/drivers/staging/ramster/ramster.h b/drivers/staging/ramster/ramster.h
deleted file mode 100644
index 0c9455e8d..000000000
--- a/drivers/staging/ramster/ramster.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * ramster.h
- *
- * Peer-to-peer transcendent memory
- *
- * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
- */
-
-#ifndef _RAMSTER_H_
-#define _RAMSTER_H_
-
-/*
- * format of remote pampd:
- *   bit 0 == intransit
- *   bit 1 == is_remote... if this bit is set, then
- *   bit 2-9 == remotenode
- *   bit 10-22 == size
- *   bit 23-30 == cksum
- */
-#define FAKE_PAMPD_INTRANSIT_BITS	1
-#define FAKE_PAMPD_ISREMOTE_BITS	1
-#define FAKE_PAMPD_REMOTENODE_BITS	8
-#define FAKE_PAMPD_REMOTESIZE_BITS	13
-#define FAKE_PAMPD_CHECKSUM_BITS	8
-
-#define FAKE_PAMPD_INTRANSIT_SHIFT	0
-#define FAKE_PAMPD_ISREMOTE_SHIFT	(FAKE_PAMPD_INTRANSIT_SHIFT + \
-					 FAKE_PAMPD_INTRANSIT_BITS)
-#define FAKE_PAMPD_REMOTENODE_SHIFT	(FAKE_PAMPD_ISREMOTE_SHIFT + \
-					 FAKE_PAMPD_ISREMOTE_BITS)
-#define FAKE_PAMPD_REMOTESIZE_SHIFT	(FAKE_PAMPD_REMOTENODE_SHIFT + \
-					 FAKE_PAMPD_REMOTENODE_BITS)
-#define FAKE_PAMPD_CHECKSUM_SHIFT	(FAKE_PAMPD_REMOTESIZE_SHIFT + \
-					 FAKE_PAMPD_REMOTESIZE_BITS)
-
-#define FAKE_PAMPD_MASK(x)		((1UL << (x)) - 1)
-
-static inline void *pampd_make_remote(int remotenode, size_t size,
-					unsigned char cksum)
-{
-	unsigned long fake_pampd = 0;
-	fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT;
-	fake_pampd |= ((unsigned long)remotenode &
-			FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS)) <<
-				FAKE_PAMPD_REMOTENODE_SHIFT;
-	fake_pampd |= ((unsigned long)size &
-			FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS)) <<
-				FAKE_PAMPD_REMOTESIZE_SHIFT;
-	fake_pampd |= ((unsigned long)cksum &
-			FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS)) <<
-				FAKE_PAMPD_CHECKSUM_SHIFT;
-	return (void *)fake_pampd;
-}
-
-static inline unsigned int pampd_remote_node(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-	return (fake_pampd >> FAKE_PAMPD_REMOTENODE_SHIFT) &
-		FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS);
-}
-
-static inline unsigned int pampd_remote_size(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-	return (fake_pampd >> FAKE_PAMPD_REMOTESIZE_SHIFT) &
-		FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS);
-}
-
-static inline unsigned char pampd_remote_cksum(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-	return (fake_pampd >> FAKE_PAMPD_CHECKSUM_SHIFT) &
-		FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS);
-}
-
-static inline bool pampd_is_remote(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-	return (fake_pampd >> FAKE_PAMPD_ISREMOTE_SHIFT) &
-		FAKE_PAMPD_MASK(FAKE_PAMPD_ISREMOTE_BITS);
-}
-
-static inline bool pampd_is_intransit(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-	return (fake_pampd >> FAKE_PAMPD_INTRANSIT_SHIFT) &
-		FAKE_PAMPD_MASK(FAKE_PAMPD_INTRANSIT_BITS);
-}
-
-/* note that it is a BUG for intransit to be set without isremote also set */
-static inline void *pampd_mark_intransit(void *pampd)
-{
-	unsigned long fake_pampd = (unsigned long)pampd;
-
-	fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT;
-	fake_pampd |= 1UL << FAKE_PAMPD_INTRANSIT_SHIFT;
-	return (void *)fake_pampd;
-}
-
-static inline void *pampd_mask_intransit_and_remote(void *marked_pampd)
-{
-	unsigned long pampd = (unsigned long)marked_pampd;
-
-	pampd &= ~(1UL << FAKE_PAMPD_INTRANSIT_SHIFT);
-	pampd &= ~(1UL << FAKE_PAMPD_ISREMOTE_SHIFT);
-	return (void *)pampd;
-}
-
-extern int ramster_remote_async_get(struct tmem_xhandle *,
-				bool, int, size_t, uint8_t, void *extra);
-extern int ramster_remote_put(struct tmem_xhandle *, char *, size_t,
-				bool, int *);
-extern int ramster_remote_flush(struct tmem_xhandle *, int);
-extern int ramster_remote_flush_object(struct tmem_xhandle *, int);
-extern int r2net_register_handlers(void);
-extern int r2net_remote_target_node_set(int);
-
-#endif /* _TMEM_H */
diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c
deleted file mode 100644
index 8f2f6892d..000000000
--- a/drivers/staging/ramster/tmem.c
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- * In-kernel transcendent memory (generic implementation)
- *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- *
- * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented
- * "handles" (triples containing a pool id, and object id, and an index), to
- * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
- * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
- * set of functions (pamops).  Each pampd contains some representation of
- * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
- * pages and must be able to insert, find, and delete these pages at a
- * potential frequency of thousands per second concurrently across many CPUs,
- * (and, if used with KVM, across many vcpus across many guests).
- * Tmem is tracked with a hierarchy of data structures, organized by
- * the elements in a handle-tuple: pool_id, object_id, and page index.
- * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
- * Each pool, contains a hash table of rb_trees of tmem_objs.  Each
- * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
- * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
- * a pampd, which is accessible only through a small set of callbacks
- * registered by the PAM implementation (see tmem_register_pamops). Tmem
- * does all memory allocation via a set of callbacks registered by the tmem
- * host implementation (e.g. see tmem_register_hostops).
- */
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/atomic.h>
-#include <linux/delay.h>
-
-#include "tmem.h"
-
-/* data structure sentinels used for debugging... see tmem.h */
-#define POOL_SENTINEL 0x87658765
-#define OBJ_SENTINEL 0x12345678
-#define OBJNODE_SENTINEL 0xfedcba09
-
-/*
- * A tmem host implementation must use this function to register callbacks
- * for memory allocation.
- */
-static struct tmem_hostops tmem_hostops;
-
-static void tmem_objnode_tree_init(void);
-
-void tmem_register_hostops(struct tmem_hostops *m)
-{
-	tmem_objnode_tree_init();
-	tmem_hostops = *m;
-}
-
-/*
- * A tmem host implementation must use this function to register
- * callbacks for a page-accessible memory (PAM) implementation
- */
-static struct tmem_pamops tmem_pamops;
-
-void tmem_register_pamops(struct tmem_pamops *m)
-{
-	tmem_pamops = *m;
-}
-
-/*
- * Oid's are potentially very sparse and tmem_objs may have an indeterminately
- * short life, being added and deleted at a relatively high frequency.
- * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
- * of the potentially huge number of tmem_objs, each pool manages a hashtable
- * of rb_trees to reduce search, insert, delete, and rebalancing time.
- * Each hashbucket also has a lock to manage concurrent access.
- *
- * The following routines manage tmem_objs.  When any tmem_obj is accessed,
- * the hashbucket lock must be held.
- */
-
-/* searches for object==oid in pool, returns locked object if found */
-static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
-					struct tmem_oid *oidp)
-{
-	struct rb_node *rbnode;
-	struct tmem_obj *obj;
-
-	rbnode = hb->obj_rb_root.rb_node;
-	while (rbnode) {
-		BUG_ON(RB_EMPTY_NODE(rbnode));
-		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
-		switch (tmem_oid_compare(oidp, &obj->oid)) {
-		case 0: /* equal */
-			goto out;
-		case -1:
-			rbnode = rbnode->rb_left;
-			break;
-		case 1:
-			rbnode = rbnode->rb_right;
-			break;
-		}
-	}
-	obj = NULL;
-out:
-	return obj;
-}
-
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
-
-/* free an object that has no more pampds in it */
-static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
-{
-	struct tmem_pool *pool;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pampd_count > 0);
-	pool = obj->pool;
-	BUG_ON(pool == NULL);
-	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
-		tmem_pampd_destroy_all_in_obj(obj);
-	BUG_ON(obj->objnode_tree_root != NULL);
-	BUG_ON((long)obj->objnode_count != 0);
-	atomic_dec(&pool->obj_count);
-	BUG_ON(atomic_read(&pool->obj_count) < 0);
-	INVERT_SENTINEL(obj, OBJ);
-	obj->pool = NULL;
-	tmem_oid_set_invalid(&obj->oid);
-	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
-}
-
-/*
- * initialize, and insert an tmem_object_root (called only if find failed)
- */
-static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
-					struct tmem_pool *pool,
-					struct tmem_oid *oidp)
-{
-	struct rb_root *root = &hb->obj_rb_root;
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-	struct tmem_obj *this;
-
-	BUG_ON(pool == NULL);
-	atomic_inc(&pool->obj_count);
-	obj->objnode_tree_height = 0;
-	obj->objnode_tree_root = NULL;
-	obj->pool = pool;
-	obj->oid = *oidp;
-	obj->objnode_count = 0;
-	obj->pampd_count = 0;
-	(*tmem_pamops.new_obj)(obj);
-	SET_SENTINEL(obj, OBJ);
-	while (*new) {
-		BUG_ON(RB_EMPTY_NODE(*new));
-		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
-		parent = *new;
-		switch (tmem_oid_compare(oidp, &this->oid)) {
-		case 0:
-			BUG(); /* already present; should never happen! */
-			break;
-		case -1:
-			new = &(*new)->rb_left;
-			break;
-		case 1:
-			new = &(*new)->rb_right;
-			break;
-		}
-	}
-	rb_link_node(&obj->rb_tree_node, parent, new);
-	rb_insert_color(&obj->rb_tree_node, root);
-}
-
-/*
- * Tmem is managed as a set of tmem_pools with certain attributes, such as
- * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
- * and all pampds that belong to a tmem_pool.  A tmem_pool is created
- * or deleted relatively rarely (for example, when a filesystem is
- * mounted or unmounted.
- */
-
-/* flush all data from a pool and, optionally, free it */
-static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
-{
-	struct rb_node *rbnode;
-	struct tmem_obj *obj;
-	struct tmem_hashbucket *hb = &pool->hashbucket[0];
-	int i;
-
-	BUG_ON(pool == NULL);
-	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
-		spin_lock(&hb->lock);
-		rbnode = rb_first(&hb->obj_rb_root);
-		while (rbnode != NULL) {
-			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
-			rbnode = rb_next(rbnode);
-			tmem_pampd_destroy_all_in_obj(obj);
-			tmem_obj_free(obj, hb);
-			(*tmem_hostops.obj_free)(obj, pool);
-		}
-		spin_unlock(&hb->lock);
-	}
-	if (destroy)
-		list_del(&pool->pool_list);
-}
-
-/*
- * A tmem_obj contains a radix-tree-like tree in which the intermediate
- * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
- * is very specialized and tuned for specific uses and is not particularly
- * suited for use from this code, though some code from the core algorithms has
- * been reused, thus the copyright notices below).  Each tmem_objnode contains
- * a set of pointers which point to either a set of intermediate tmem_objnodes
- * or a set of of pampds.
- *
- * Portions Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- */
-
-struct tmem_objnode_tree_path {
-	struct tmem_objnode *objnode;
-	int offset;
-};
-
-/* objnode height_to_maxindex translation */
-static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
-
-static void tmem_objnode_tree_init(void)
-{
-	unsigned int ht, tmp;
-
-	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
-		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
-		if (tmp >= OBJNODE_TREE_INDEX_BITS)
-			tmem_objnode_tree_h2max[ht] = ~0UL;
-		else
-			tmem_objnode_tree_h2max[ht] =
-			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
-	}
-}
-
-static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
-{
-	struct tmem_objnode *objnode;
-
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
-	if (unlikely(objnode == NULL))
-		goto out;
-	objnode->obj = obj;
-	SET_SENTINEL(objnode, OBJNODE);
-	memset(&objnode->slots, 0, sizeof(objnode->slots));
-	objnode->slots_in_use = 0;
-	obj->objnode_count++;
-out:
-	return objnode;
-}
-
-static void tmem_objnode_free(struct tmem_objnode *objnode)
-{
-	struct tmem_pool *pool;
-	int i;
-
-	BUG_ON(objnode == NULL);
-	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
-		BUG_ON(objnode->slots[i] != NULL);
-	ASSERT_SENTINEL(objnode, OBJNODE);
-	INVERT_SENTINEL(objnode, OBJNODE);
-	BUG_ON(objnode->obj == NULL);
-	ASSERT_SENTINEL(objnode->obj, OBJ);
-	pool = objnode->obj->pool;
-	BUG_ON(pool == NULL);
-	ASSERT_SENTINEL(pool, POOL);
-	objnode->obj->objnode_count--;
-	objnode->obj = NULL;
-	(*tmem_hostops.objnode_free)(objnode, pool);
-}
-
-/*
- * lookup index in object and return associated pampd (or NULL if not found)
- */
-static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
-{
-	unsigned int height, shift;
-	struct tmem_objnode **slot = NULL;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-
-	height = obj->objnode_tree_height;
-	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
-		goto out;
-	if (height == 0 && obj->objnode_tree_root) {
-		slot = &obj->objnode_tree_root;
-		goto out;
-	}
-	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
-	slot = &obj->objnode_tree_root;
-	while (height > 0) {
-		if (*slot == NULL)
-			goto out;
-		slot = (struct tmem_objnode **)
-			((*slot)->slots +
-			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	}
-out:
-	return slot != NULL ? (void **)slot : NULL;
-}
-
-static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
-{
-	struct tmem_objnode **slot;
-
-	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
-	return slot != NULL ? *slot : NULL;
-}
-
-static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
-					void *new_pampd, bool no_free)
-{
-	struct tmem_objnode **slot;
-	void *ret = NULL;
-
-	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
-	if ((slot != NULL) && (*slot != NULL)) {
-		void *old_pampd = *(void **)slot;
-		*(void **)slot = new_pampd;
-		if (!no_free)
-			(*tmem_pamops.free)(old_pampd, obj->pool,
-						NULL, 0, false);
-		ret = new_pampd;
-	}
-	return ret;
-}
-
-static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
-					void *pampd)
-{
-	int ret = 0;
-	struct tmem_objnode *objnode = NULL, *newnode, *slot;
-	unsigned int height, shift;
-	int offset = 0;
-
-	/* if necessary, extend the tree to be higher  */
-	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
-		height = obj->objnode_tree_height + 1;
-		if (index > tmem_objnode_tree_h2max[height])
-			while (index > tmem_objnode_tree_h2max[height])
-				height++;
-		if (obj->objnode_tree_root == NULL) {
-			obj->objnode_tree_height = height;
-			goto insert;
-		}
-		do {
-			newnode = tmem_objnode_alloc(obj);
-			if (!newnode) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			newnode->slots[0] = obj->objnode_tree_root;
-			newnode->slots_in_use = 1;
-			obj->objnode_tree_root = newnode;
-			obj->objnode_tree_height++;
-		} while (height > obj->objnode_tree_height);
-	}
-insert:
-	slot = obj->objnode_tree_root;
-	height = obj->objnode_tree_height;
-	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
-	while (height > 0) {
-		if (slot == NULL) {
-			/* add a child objnode.  */
-			slot = tmem_objnode_alloc(obj);
-			if (!slot) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			if (objnode) {
-
-				objnode->slots[offset] = slot;
-				objnode->slots_in_use++;
-			} else
-				obj->objnode_tree_root = slot;
-		}
-		/* go down a level */
-		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
-		objnode = slot;
-		slot = objnode->slots[offset];
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	}
-	BUG_ON(slot != NULL);
-	if (objnode) {
-		objnode->slots_in_use++;
-		objnode->slots[offset] = pampd;
-	} else
-		obj->objnode_tree_root = pampd;
-	obj->pampd_count++;
-out:
-	return ret;
-}
-
-static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
-{
-	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
-	struct tmem_objnode_tree_path *pathp = path;
-	struct tmem_objnode *slot = NULL;
-	unsigned int height, shift;
-	int offset;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-	height = obj->objnode_tree_height;
-	if (index > tmem_objnode_tree_h2max[height])
-		goto out;
-	slot = obj->objnode_tree_root;
-	if (height == 0 && obj->objnode_tree_root) {
-		obj->objnode_tree_root = NULL;
-		goto out;
-	}
-	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
-	pathp->objnode = NULL;
-	do {
-		if (slot == NULL)
-			goto out;
-		pathp++;
-		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->objnode = slot;
-		slot = slot->slots[offset];
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
-	if (slot == NULL)
-		goto out;
-	while (pathp->objnode) {
-		pathp->objnode->slots[pathp->offset] = NULL;
-		pathp->objnode->slots_in_use--;
-		if (pathp->objnode->slots_in_use) {
-			if (pathp->objnode == obj->objnode_tree_root) {
-				while (obj->objnode_tree_height > 0 &&
-				  obj->objnode_tree_root->slots_in_use == 1 &&
-				  obj->objnode_tree_root->slots[0]) {
-					struct tmem_objnode *to_free =
-						obj->objnode_tree_root;
-
-					obj->objnode_tree_root =
-							to_free->slots[0];
-					obj->objnode_tree_height--;
-					to_free->slots[0] = NULL;
-					to_free->slots_in_use = 0;
-					tmem_objnode_free(to_free);
-				}
-			}
-			goto out;
-		}
-		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
-		pathp--;
-	}
-	obj->objnode_tree_height = 0;
-	obj->objnode_tree_root = NULL;
-
-out:
-	if (slot != NULL)
-		obj->pampd_count--;
-	BUG_ON(obj->pampd_count < 0);
-	return slot;
-}
-
-/* recursively walk the objnode_tree destroying pampds and objnodes */
-static void tmem_objnode_node_destroy(struct tmem_obj *obj,
-					struct tmem_objnode *objnode,
-					unsigned int ht)
-{
-	int i;
-
-	if (ht == 0)
-		return;
-	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
-		if (objnode->slots[i]) {
-			if (ht == 1) {
-				obj->pampd_count--;
-				(*tmem_pamops.free)(objnode->slots[i],
-						obj->pool, NULL, 0, true);
-				objnode->slots[i] = NULL;
-				continue;
-			}
-			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
-			tmem_objnode_free(objnode->slots[i]);
-			objnode->slots[i] = NULL;
-		}
-	}
-}
-
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
-{
-	if (obj->objnode_tree_root == NULL)
-		return;
-	if (obj->objnode_tree_height == 0) {
-		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root,
-					obj->pool, NULL, 0, true);
-	} else {
-		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
-					obj->objnode_tree_height);
-		tmem_objnode_free(obj->objnode_tree_root);
-		obj->objnode_tree_height = 0;
-	}
-	obj->objnode_tree_root = NULL;
-	(*tmem_pamops.free_obj)(obj->pool, obj);
-}
-
-/*
- * Tmem is operated on by a set of well-defined actions:
- * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
- * (The tmem ABI allows for subpages and exchanges but these operations
- * are not included in this implementation.)
- *
- * These "tmem core" operations are implemented in the following functions.
- */
-
-/*
- * "Put" a page, e.g. copy a page from the kernel into newly allocated
- * PAM space (if such space is available).  Tmem_put is complicated by
- * a corner case: What if a page with matching handle already exists in
- * tmem?  To guarantee coherency, one of two actions is necessary: Either
- * the data for the page must be overwritten, or the page must be
- * "flushed" so that the data is not accessible to a subsequent "get".
- * Since these "duplicate puts" are relatively rare, this implementation
- * always flushes for simplicity.
- */
-int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t size, bool raw, int ephemeral)
-{
-	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
-	void *pampd = NULL, *pampd_del = NULL;
-	int ret = -ENOMEM;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = objfound = tmem_obj_find(hb, oidp);
-	if (obj != NULL) {
-		pampd = tmem_pampd_lookup_in_obj(objfound, index);
-		if (pampd != NULL) {
-			/* if found, is a dup put, flush the old one */
-			pampd_del = tmem_pampd_delete_from_obj(obj, index);
-			BUG_ON(pampd_del != pampd);
-			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
-			if (obj->pampd_count == 0) {
-				objnew = obj;
-				objfound = NULL;
-			}
-			pampd = NULL;
-		}
-	} else {
-		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
-		if (unlikely(obj == NULL)) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		tmem_obj_init(obj, hb, pool, oidp);
-	}
-	BUG_ON(obj == NULL);
-	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
-	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
-					obj->pool, &obj->oid, index);
-	if (unlikely(pampd == NULL))
-		goto free;
-	ret = tmem_pampd_add_to_obj(obj, index, pampd);
-	if (unlikely(ret == -ENOMEM))
-		/* may have partially built objnode tree ("stump") */
-		goto delete_and_free;
-	goto out;
-
-delete_and_free:
-	(void)tmem_pampd_delete_from_obj(obj, index);
-free:
-	if (pampd)
-		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
-	if (objnew) {
-		tmem_obj_free(objnew, hb);
-		(*tmem_hostops.obj_free)(objnew, pool);
-	}
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
-				uint32_t index, struct tmem_obj **ret_obj,
-				void **saved_hb)
-{
-	struct tmem_hashbucket *hb;
-	struct tmem_obj *obj = NULL;
-	void *pampd = NULL;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (likely(obj != NULL))
-		pampd = tmem_pampd_lookup_in_obj(obj, index);
-	*ret_obj = obj;
-	*saved_hb = (void *)hb;
-	/* note, hashbucket remains locked */
-	return pampd;
-}
-
-void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
-			  void *pampd, void *saved_hb, bool delete)
-{
-	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
-
-	BUG_ON(!spin_is_locked(&hb->lock));
-	if (pampd != NULL) {
-		BUG_ON(obj == NULL);
-		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
-	} else if (delete) {
-		BUG_ON(obj == NULL);
-		(void)tmem_pampd_delete_from_obj(obj, index);
-	}
-	spin_unlock(&hb->lock);
-}
-
-static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
-				struct tmem_pool *pool, struct tmem_oid *oidp,
-				uint32_t index, bool free, char *data)
-{
-	void *old_pampd = *ppampd, *new_pampd = NULL;
-	bool intransit = false;
-	int ret = 0;
-
-
-	if (!is_ephemeral(pool))
-		new_pampd = (*tmem_pamops.repatriate_preload)(
-				old_pampd, pool, oidp, index, &intransit);
-	if (intransit)
-		ret = -EAGAIN;
-	else if (new_pampd != NULL)
-		*ppampd = new_pampd;
-	/* must release the hb->lock else repatriate can't sleep */
-	spin_unlock(&hb->lock);
-	if (!intransit)
-		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
-						oidp, index, free, data);
-	return ret;
-}
-
-/*
- * "Get" a page, e.g. if one can be found, copy the tmem page with the
- * matching handle from PAM space to the kernel.  By tmem definition,
- * when a "get" is successful on an ephemeral page, the page is "flushed",
- * and when a "get" is successful on a persistent page, the page is retained
- * in tmem.  Note that to preserve
- * coherency, "get" can never be skipped if tmem contains the data.
- * That is, if a get is done with a certain handle and fails, any
- * subsequent "get" must also fail (unless of course there is a
- * "put" done with the same handle).
-
- */
-int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t *size, bool raw, int get_and_free)
-{
-	struct tmem_obj *obj;
-	void *pampd;
-	bool ephemeral = is_ephemeral(pool);
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
-	bool lock_held = 0;
-	void **ppampd;
-
-again:
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	lock_held = 1;
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	ppampd = __tmem_pampd_lookup_in_obj(obj, index);
-	if (ppampd == NULL)
-		goto out;
-	if (tmem_pamops.is_remote(*ppampd)) {
-		ret = tmem_repatriate(ppampd, hb, pool, oidp,
-					index, free, data);
-		lock_held = 0; /* note hb->lock has been unlocked */
-		if (ret == -EAGAIN) {
-			/* rare I think, but should cond_resched()??? */
-			usleep_range(10, 1000);
-			goto again;
-		} else if (ret != 0) {
-			if (ret != -ENOENT)
-				pr_err("UNTESTED case in tmem_get, ret=%d\n",
-						ret);
-			ret = -1;
-			goto out;
-		}
-		goto out;
-	}
-	if (free)
-		pampd = tmem_pampd_delete_from_obj(obj, index);
-	else
-		pampd = tmem_pampd_lookup_in_obj(obj, index);
-	if (pampd == NULL)
-		goto out;
-	if (free) {
-		if (obj->pampd_count == 0) {
-			tmem_obj_free(obj, hb);
-			(*tmem_hostops.obj_free)(obj, pool);
-			obj = NULL;
-		}
-	}
-	if (free)
-		ret = (*tmem_pamops.get_data_and_free)(
-				data, size, raw, pampd, pool, oidp, index);
-	else
-		ret = (*tmem_pamops.get_data)(
-				data, size, raw, pampd, pool, oidp, index);
-	if (ret < 0)
-		goto out;
-	ret = 0;
-out:
-	if (lock_held)
-		spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * If a page in tmem matches the handle, "flush" this page from tmem such
- * that any subsequent "get" does not succeed (unless, of course, there
- * was another "put" with the same handle).
- */
-int tmem_flush_page(struct tmem_pool *pool,
-				struct tmem_oid *oidp, uint32_t index)
-{
-	struct tmem_obj *obj;
-	void *pampd;
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	pampd = tmem_pampd_delete_from_obj(obj, index);
-	if (pampd == NULL)
-		goto out;
-	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
-	if (obj->pampd_count == 0) {
-		tmem_obj_free(obj, hb);
-		(*tmem_hostops.obj_free)(obj, pool);
-	}
-	ret = 0;
-
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page.  Returns the new page if
- * there was a page to replace, else returns NULL.
- */
-int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
-			uint32_t index, void *new_pampd)
-{
-	struct tmem_obj *obj;
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
-	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * "Flush" all pages in tmem matching this oid.
- */
-int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
-{
-	struct tmem_obj *obj;
-	struct tmem_hashbucket *hb;
-	int ret = -1;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	tmem_pampd_destroy_all_in_obj(obj);
-	tmem_obj_free(obj, hb);
-	(*tmem_hostops.obj_free)(obj, pool);
-	ret = 0;
-
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
- * all subsequent access to this tmem_pool.
- */
-int tmem_destroy_pool(struct tmem_pool *pool)
-{
-	int ret = -1;
-
-	if (pool == NULL)
-		goto out;
-	tmem_pool_flush(pool, 1);
-	ret = 0;
-out:
-	return ret;
-}
-
-static LIST_HEAD(tmem_global_pool_list);
-
-/*
- * Create a new tmem_pool with the provided flag and return
- * a pool id provided by the tmem host implementation.
- */
-void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
-{
-	int persistent = flags & TMEM_POOL_PERSIST;
-	int shared = flags & TMEM_POOL_SHARED;
-	struct tmem_hashbucket *hb = &pool->hashbucket[0];
-	int i;
-
-	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
-		hb->obj_rb_root = RB_ROOT;
-		spin_lock_init(&hb->lock);
-	}
-	INIT_LIST_HEAD(&pool->pool_list);
-	atomic_set(&pool->obj_count, 0);
-	SET_SENTINEL(pool, POOL);
-	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
-	pool->persistent = persistent;
-	pool->shared = shared;
-}
diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h
deleted file mode 100644
index 47f1918c8..000000000
--- a/drivers/staging/ramster/tmem.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * tmem.h
- *
- * Transcendent memory
- *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- */
-
-#ifndef _TMEM_H_
-#define _TMEM_H_
-
-#include <linux/highmem.h>
-#include <linux/hash.h>
-#include <linux/atomic.h>
-
-/*
- * These are pre-defined by the Xen<->Linux ABI
- */
-#define TMEM_PUT_PAGE			4
-#define TMEM_GET_PAGE			5
-#define TMEM_FLUSH_PAGE			6
-#define TMEM_FLUSH_OBJECT		7
-#define TMEM_POOL_PERSIST		1
-#define TMEM_POOL_SHARED		2
-#define TMEM_POOL_PRECOMPRESSED		4
-#define TMEM_POOL_PAGESIZE_SHIFT	4
-#define TMEM_POOL_PAGESIZE_MASK		0xf
-#define TMEM_POOL_RESERVED_BITS		0x00ffff00
-
-/*
- * sentinels have proven very useful for debugging but can be removed
- * or disabled before final merge.
- */
-#define SENTINELS
-#ifdef SENTINELS
-#define DECL_SENTINEL uint32_t sentinel;
-#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
-#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
-#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
-#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
-#else
-#define DECL_SENTINEL
-#define SET_SENTINEL(_x, _y) do { } while (0)
-#define INVERT_SENTINEL(_x, _y) do { } while (0)
-#define ASSERT_SENTINEL(_x, _y) do { } while (0)
-#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
-#endif
-
-#define ASSERT_SPINLOCK(_l)	WARN_ON(!spin_is_locked(_l))
-
-/*
- * A pool is the highest-level data structure managed by tmem and
- * usually corresponds to a large independent set of pages such as
- * a filesystem.  Each pool has an id, and certain attributes and counters.
- * It also contains a set of hash buckets, each of which contains an rbtree
- * of objects and a lock to manage concurrency within the pool.
- */
-
-#define TMEM_HASH_BUCKET_BITS	8
-#define TMEM_HASH_BUCKETS	(1<<TMEM_HASH_BUCKET_BITS)
-
-struct tmem_hashbucket {
-	struct rb_root obj_rb_root;
-	spinlock_t lock;
-};
-
-struct tmem_pool {
-	void *client; /* "up" for some clients, avoids table lookup */
-	struct list_head pool_list;
-	uint32_t pool_id;
-	bool persistent;
-	bool shared;
-	atomic_t obj_count;
-	atomic_t refcount;
-	struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
-	DECL_SENTINEL
-};
-
-#define is_persistent(_p)  (_p->persistent)
-#define is_ephemeral(_p)   (!(_p->persistent))
-
-/*
- * An object id ("oid") is large: 192-bits (to ensure, for example, files
- * in a modern filesystem can be uniquely identified).
- */
-
-struct tmem_oid {
-	uint64_t oid[3];
-};
-
-struct tmem_xhandle {
-	uint8_t client_id;
-	uint8_t xh_data_cksum;
-	uint16_t xh_data_size;
-	uint16_t pool_id;
-	struct tmem_oid oid;
-	uint32_t index;
-	void *extra;
-};
-
-static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id,
-					struct tmem_pool *pool,
-					struct tmem_oid *oidp,
-					uint32_t index)
-{
-	struct tmem_xhandle xh;
-	xh.client_id = client_id;
-	xh.xh_data_cksum = (uint8_t)-1;
-	xh.xh_data_size = (uint16_t)-1;
-	xh.pool_id = pool->pool_id;
-	xh.oid = *oidp;
-	xh.index = index;
-	return xh;
-}
-
-static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
-{
-	oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
-}
-
-static inline bool tmem_oid_valid(struct tmem_oid *oidp)
-{
-	return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
-		oidp->oid[2] != -1UL;
-}
-
-static inline int tmem_oid_compare(struct tmem_oid *left,
-					struct tmem_oid *right)
-{
-	int ret;
-
-	if (left->oid[2] == right->oid[2]) {
-		if (left->oid[1] == right->oid[1]) {
-			if (left->oid[0] == right->oid[0])
-				ret = 0;
-			else if (left->oid[0] < right->oid[0])
-				ret = -1;
-			else
-				return 1;
-		} else if (left->oid[1] < right->oid[1])
-			ret = -1;
-		else
-			ret = 1;
-	} else if (left->oid[2] < right->oid[2])
-		ret = -1;
-	else
-		ret = 1;
-	return ret;
-}
-
-static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
-{
-	return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
-				TMEM_HASH_BUCKET_BITS);
-}
-
-/*
- * A tmem_obj contains an identifier (oid), pointers to the parent
- * pool and the rb_tree to which it belongs, counters, and an ordered
- * set of pampds, structured in a radix-tree-like tree.  The intermediate
- * nodes of the tree are called tmem_objnodes.
- */
-
-struct tmem_objnode;
-
-struct tmem_obj {
-	struct tmem_oid oid;
-	struct tmem_pool *pool;
-	struct rb_node rb_tree_node;
-	struct tmem_objnode *objnode_tree_root;
-	unsigned int objnode_tree_height;
-	unsigned long objnode_count;
-	long pampd_count;
-	/* for current design of ramster, all pages belonging to
-	 * an object reside on the same remotenode and extra is
-	 * used to record the number of the remotenode so a
-	 * flush-object operation can specify it */
-	void *extra; /* for use by pampd implementation */
-	DECL_SENTINEL
-};
-
-#define OBJNODE_TREE_MAP_SHIFT 6
-#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
-#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
-#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define OBJNODE_TREE_MAX_PATH \
-		(OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
-
-struct tmem_objnode {
-	struct tmem_obj *obj;
-	DECL_SENTINEL
-	void *slots[OBJNODE_TREE_MAP_SIZE];
-	unsigned int slots_in_use;
-};
-
-/* pampd abstract datatype methods provided by the PAM implementation */
-struct tmem_pamops {
-	void *(*create)(char *, size_t, bool, int,
-			struct tmem_pool *, struct tmem_oid *, uint32_t);
-	int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
-				struct tmem_oid *, uint32_t);
-	int (*get_data_and_free)(char *, size_t *, bool, void *,
-				struct tmem_pool *, struct tmem_oid *,
-				uint32_t);
-	void (*free)(void *, struct tmem_pool *,
-				struct tmem_oid *, uint32_t, bool);
-	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
-	bool (*is_remote)(void *);
-	void *(*repatriate_preload)(void *, struct tmem_pool *,
-					struct tmem_oid *, uint32_t, bool *);
-	int (*repatriate)(void *, void *, struct tmem_pool *,
-				struct tmem_oid *, uint32_t, bool, void *);
-	void (*new_obj)(struct tmem_obj *);
-	int (*replace_in_obj)(void *, struct tmem_obj *);
-};
-extern void tmem_register_pamops(struct tmem_pamops *m);
-
-/* memory allocation methods provided by the host implementation */
-struct tmem_hostops {
-	struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
-	void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
-	struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
-	void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
-};
-extern void tmem_register_hostops(struct tmem_hostops *m);
-
-/* core tmem accessor functions */
-extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t, bool, int);
-extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t *, bool, int);
-extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			void *);
-extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *,
-				   uint32_t index, struct tmem_obj **,
-				   void **);
-extern void tmem_localify_finish(struct tmem_obj *, uint32_t index,
-				 void *, void *, bool);
-extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
-			uint32_t index);
-extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
-extern int tmem_destroy_pool(struct tmem_pool *);
-extern void tmem_new_pool(struct tmem_pool *, uint32_t);
-#endif /* _TMEM_H */
diff --git a/drivers/staging/ramster/xvmalloc.c b/drivers/staging/ramster/xvmalloc.c
deleted file mode 100644
index 93ba8e940..000000000
--- a/drivers/staging/ramster/xvmalloc.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * xvmalloc memory allocator
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-#ifdef CONFIG_ZRAM_DEBUG
-#define DEBUG
-#endif
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/errno.h>
-#include <linux/highmem.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-
-#include "xvmalloc.h"
-#include "xvmalloc_int.h"
-
-static void stat_inc(u64 *value)
-{
-	*value = *value + 1;
-}
-
-static void stat_dec(u64 *value)
-{
-	*value = *value - 1;
-}
-
-static int test_flag(struct block_header *block, enum blockflags flag)
-{
-	return block->prev & BIT(flag);
-}
-
-static void set_flag(struct block_header *block, enum blockflags flag)
-{
-	block->prev |= BIT(flag);
-}
-
-static void clear_flag(struct block_header *block, enum blockflags flag)
-{
-	block->prev &= ~BIT(flag);
-}
-
-/*
- * Given <page, offset> pair, provide a dereferencable pointer.
- * This is called from xv_malloc/xv_free path, so it
- * needs to be fast.
- */
-static void *get_ptr_atomic(struct page *page, u16 offset)
-{
-	unsigned char *base;
-
-	base = kmap_atomic(page);
-	return base + offset;
-}
-
-static void put_ptr_atomic(void *ptr)
-{
-	kunmap_atomic(ptr);
-}
-
-static u32 get_blockprev(struct block_header *block)
-{
-	return block->prev & PREV_MASK;
-}
-
-static void set_blockprev(struct block_header *block, u16 new_offset)
-{
-	block->prev = new_offset | (block->prev & FLAGS_MASK);
-}
-
-static struct block_header *BLOCK_NEXT(struct block_header *block)
-{
-	return (struct block_header *)
-		((char *)block + block->size + XV_ALIGN);
-}
-
-/*
- * Get index of free list containing blocks of maximum size
- * which is less than or equal to given size.
- */
-static u32 get_index_for_insert(u32 size)
-{
-	if (unlikely(size > XV_MAX_ALLOC_SIZE))
-		size = XV_MAX_ALLOC_SIZE;
-	size &= ~FL_DELTA_MASK;
-	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
-}
-
-/*
- * Get index of free list having blocks of size greater than
- * or equal to requested size.
- */
-static u32 get_index(u32 size)
-{
-	if (unlikely(size < XV_MIN_ALLOC_SIZE))
-		size = XV_MIN_ALLOC_SIZE;
-	size = ALIGN(size, FL_DELTA);
-	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
-}
-
-/**
- * find_block - find block of at least given size
- * @pool: memory pool to search from
- * @size: size of block required
- * @page: page containing required block
- * @offset: offset within the page where block is located.
- *
- * Searches two level bitmap to locate block of at least
- * the given size. If such a block is found, it provides
- * <page, offset> to identify this block and returns index
- * in freelist where we found this block.
- * Otherwise, returns 0 and <page, offset> params are not touched.
- */
-static u32 find_block(struct xv_pool *pool, u32 size,
-			struct page **page, u32 *offset)
-{
-	ulong flbitmap, slbitmap;
-	u32 flindex, slindex, slbitstart;
-
-	/* There are no free blocks in this pool */
-	if (!pool->flbitmap)
-		return 0;
-
-	/* Get freelist index correspoding to this size */
-	slindex = get_index(size);
-	slbitmap = pool->slbitmap[slindex / BITS_PER_LONG];
-	slbitstart = slindex % BITS_PER_LONG;
-
-	/*
-	 * If freelist is not empty at this index, we found the
-	 * block - head of this list. This is approximate best-fit match.
-	 */
-	if (test_bit(slbitstart, &slbitmap)) {
-		*page = pool->freelist[slindex].page;
-		*offset = pool->freelist[slindex].offset;
-		return slindex;
-	}
-
-	/*
-	 * No best-fit found. Search a bit further in bitmap for a free block.
-	 * Second level bitmap consists of series of 32-bit chunks. Search
-	 * further in the chunk where we expected a best-fit, starting from
-	 * index location found above.
-	 */
-	slbitstart++;
-	slbitmap >>= slbitstart;
-
-	/* Skip this search if we were already at end of this bitmap chunk */
-	if ((slbitstart != BITS_PER_LONG) && slbitmap) {
-		slindex += __ffs(slbitmap) + 1;
-		*page = pool->freelist[slindex].page;
-		*offset = pool->freelist[slindex].offset;
-		return slindex;
-	}
-
-	/* Now do a full two-level bitmap search to find next nearest fit */
-	flindex = slindex / BITS_PER_LONG;
-
-	flbitmap = (pool->flbitmap) >> (flindex + 1);
-	if (!flbitmap)
-		return 0;
-
-	flindex += __ffs(flbitmap) + 1;
-	slbitmap = pool->slbitmap[flindex];
-	slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap);
-	*page = pool->freelist[slindex].page;
-	*offset = pool->freelist[slindex].offset;
-
-	return slindex;
-}
-
-/*
- * Insert block at <page, offset> in freelist of given pool.
- * freelist used depends on block size.
- */
-static void insert_block(struct xv_pool *pool, struct page *page, u32 offset,
-			struct block_header *block)
-{
-	u32 flindex, slindex;
-	struct block_header *nextblock;
-
-	slindex = get_index_for_insert(block->size);
-	flindex = slindex / BITS_PER_LONG;
-
-	block->link.prev_page = NULL;
-	block->link.prev_offset = 0;
-	block->link.next_page = pool->freelist[slindex].page;
-	block->link.next_offset = pool->freelist[slindex].offset;
-	pool->freelist[slindex].page = page;
-	pool->freelist[slindex].offset = offset;
-
-	if (block->link.next_page) {
-		nextblock = get_ptr_atomic(block->link.next_page,
-					block->link.next_offset);
-		nextblock->link.prev_page = page;
-		nextblock->link.prev_offset = offset;
-		put_ptr_atomic(nextblock);
-		/* If there was a next page then the free bits are set. */
-		return;
-	}
-
-	__set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
-	__set_bit(flindex, &pool->flbitmap);
-}
-
-/*
- * Remove block from freelist. Index 'slindex' identifies the freelist.
- */
-static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
-			struct block_header *block, u32 slindex)
-{
-	u32 flindex = slindex / BITS_PER_LONG;
-	struct block_header *tmpblock;
-
-	if (block->link.prev_page) {
-		tmpblock = get_ptr_atomic(block->link.prev_page,
-				block->link.prev_offset);
-		tmpblock->link.next_page = block->link.next_page;
-		tmpblock->link.next_offset = block->link.next_offset;
-		put_ptr_atomic(tmpblock);
-	}
-
-	if (block->link.next_page) {
-		tmpblock = get_ptr_atomic(block->link.next_page,
-				block->link.next_offset);
-		tmpblock->link.prev_page = block->link.prev_page;
-		tmpblock->link.prev_offset = block->link.prev_offset;
-		put_ptr_atomic(tmpblock);
-	}
-
-	/* Is this block is at the head of the freelist? */
-	if (pool->freelist[slindex].page == page
-	   && pool->freelist[slindex].offset == offset) {
-
-		pool->freelist[slindex].page = block->link.next_page;
-		pool->freelist[slindex].offset = block->link.next_offset;
-
-		if (pool->freelist[slindex].page) {
-			struct block_header *tmpblock;
-			tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
-					pool->freelist[slindex].offset);
-			tmpblock->link.prev_page = NULL;
-			tmpblock->link.prev_offset = 0;
-			put_ptr_atomic(tmpblock);
-		} else {
-			/* This freelist bucket is empty */
-			__clear_bit(slindex % BITS_PER_LONG,
-				    &pool->slbitmap[flindex]);
-			if (!pool->slbitmap[flindex])
-				__clear_bit(flindex, &pool->flbitmap);
-		}
-	}
-
-	block->link.prev_page = NULL;
-	block->link.prev_offset = 0;
-	block->link.next_page = NULL;
-	block->link.next_offset = 0;
-}
-
-/*
- * Allocate a page and add it to freelist of given pool.
- */
-static int grow_pool(struct xv_pool *pool, gfp_t flags)
-{
-	struct page *page;
-	struct block_header *block;
-
-	page = alloc_page(flags);
-	if (unlikely(!page))
-		return -ENOMEM;
-
-	stat_inc(&pool->total_pages);
-
-	spin_lock(&pool->lock);
-	block = get_ptr_atomic(page, 0);
-
-	block->size = PAGE_SIZE - XV_ALIGN;
-	set_flag(block, BLOCK_FREE);
-	clear_flag(block, PREV_FREE);
-	set_blockprev(block, 0);
-
-	insert_block(pool, page, 0, block);
-
-	put_ptr_atomic(block);
-	spin_unlock(&pool->lock);
-
-	return 0;
-}
-
-/*
- * Create a memory pool. Allocates freelist, bitmaps and other
- * per-pool metadata.
- */
-struct xv_pool *xv_create_pool(void)
-{
-	u32 ovhd_size;
-	struct xv_pool *pool;
-
-	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
-	pool = kzalloc(ovhd_size, GFP_KERNEL);
-	if (!pool)
-		return NULL;
-
-	spin_lock_init(&pool->lock);
-
-	return pool;
-}
-EXPORT_SYMBOL_GPL(xv_create_pool);
-
-void xv_destroy_pool(struct xv_pool *pool)
-{
-	kfree(pool);
-}
-EXPORT_SYMBOL_GPL(xv_destroy_pool);
-
-/**
- * xv_malloc - Allocate block of given size from pool.
- * @pool: pool to allocate from
- * @size: size of block to allocate
- * @page: page no. that holds the object
- * @offset: location of object within page
- *
- * On success, <page, offset> identifies block allocated
- * and 0 is returned. On failure, <page, offset> is set to
- * 0 and -ENOMEM is returned.
- *
- * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
- */
-int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
-		u32 *offset, gfp_t flags)
-{
-	int error;
-	u32 index, tmpsize, origsize, tmpoffset;
-	struct block_header *block, *tmpblock;
-
-	*page = NULL;
-	*offset = 0;
-	origsize = size;
-
-	if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
-		return -ENOMEM;
-
-	size = ALIGN(size, XV_ALIGN);
-
-	spin_lock(&pool->lock);
-
-	index = find_block(pool, size, page, offset);
-
-	if (!*page) {
-		spin_unlock(&pool->lock);
-		if (flags & GFP_NOWAIT)
-			return -ENOMEM;
-		error = grow_pool(pool, flags);
-		if (unlikely(error))
-			return error;
-
-		spin_lock(&pool->lock);
-		index = find_block(pool, size, page, offset);
-	}
-
-	if (!*page) {
-		spin_unlock(&pool->lock);
-		return -ENOMEM;
-	}
-
-	block = get_ptr_atomic(*page, *offset);
-
-	remove_block(pool, *page, *offset, block, index);
-
-	/* Split the block if required */
-	tmpoffset = *offset + size + XV_ALIGN;
-	tmpsize = block->size - size;
-	tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
-	if (tmpsize) {
-		tmpblock->size = tmpsize - XV_ALIGN;
-		set_flag(tmpblock, BLOCK_FREE);
-		clear_flag(tmpblock, PREV_FREE);
-
-		set_blockprev(tmpblock, *offset);
-		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
-			insert_block(pool, *page, tmpoffset, tmpblock);
-
-		if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
-			tmpblock = BLOCK_NEXT(tmpblock);
-			set_blockprev(tmpblock, tmpoffset);
-		}
-	} else {
-		/* This block is exact fit */
-		if (tmpoffset != PAGE_SIZE)
-			clear_flag(tmpblock, PREV_FREE);
-	}
-
-	block->size = origsize;
-	clear_flag(block, BLOCK_FREE);
-
-	put_ptr_atomic(block);
-	spin_unlock(&pool->lock);
-
-	*offset += XV_ALIGN;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(xv_malloc);
-
-/*
- * Free block identified with <page, offset>
- */
-void xv_free(struct xv_pool *pool, struct page *page, u32 offset)
-{
-	void *page_start;
-	struct block_header *block, *tmpblock;
-
-	offset -= XV_ALIGN;
-
-	spin_lock(&pool->lock);
-
-	page_start = get_ptr_atomic(page, 0);
-	block = (struct block_header *)((char *)page_start + offset);
-
-	/* Catch double free bugs */
-	BUG_ON(test_flag(block, BLOCK_FREE));
-
-	block->size = ALIGN(block->size, XV_ALIGN);
-
-	tmpblock = BLOCK_NEXT(block);
-	if (offset + block->size + XV_ALIGN == PAGE_SIZE)
-		tmpblock = NULL;
-
-	/* Merge next block if its free */
-	if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) {
-		/*
-		 * Blocks smaller than XV_MIN_ALLOC_SIZE
-		 * are not inserted in any free list.
-		 */
-		if (tmpblock->size >= XV_MIN_ALLOC_SIZE) {
-			remove_block(pool, page,
-				    offset + block->size + XV_ALIGN, tmpblock,
-				    get_index_for_insert(tmpblock->size));
-		}
-		block->size += tmpblock->size + XV_ALIGN;
-	}
-
-	/* Merge previous block if its free */
-	if (test_flag(block, PREV_FREE)) {
-		tmpblock = (struct block_header *)((char *)(page_start) +
-						get_blockprev(block));
-		offset = offset - tmpblock->size - XV_ALIGN;
-
-		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
-			remove_block(pool, page, offset, tmpblock,
-				    get_index_for_insert(tmpblock->size));
-
-		tmpblock->size += block->size + XV_ALIGN;
-		block = tmpblock;
-	}
-
-	/* No used objects in this page. Free it. */
-	if (block->size == PAGE_SIZE - XV_ALIGN) {
-		put_ptr_atomic(page_start);
-		spin_unlock(&pool->lock);
-
-		__free_page(page);
-		stat_dec(&pool->total_pages);
-		return;
-	}
-
-	set_flag(block, BLOCK_FREE);
-	if (block->size >= XV_MIN_ALLOC_SIZE)
-		insert_block(pool, page, offset, block);
-
-	if (offset + block->size + XV_ALIGN != PAGE_SIZE) {
-		tmpblock = BLOCK_NEXT(block);
-		set_flag(tmpblock, PREV_FREE);
-		set_blockprev(tmpblock, offset);
-	}
-
-	put_ptr_atomic(page_start);
-	spin_unlock(&pool->lock);
-}
-EXPORT_SYMBOL_GPL(xv_free);
-
-u32 xv_get_object_size(void *obj)
-{
-	struct block_header *blk;
-
-	blk = (struct block_header *)((char *)(obj) - XV_ALIGN);
-	return blk->size;
-}
-EXPORT_SYMBOL_GPL(xv_get_object_size);
-
-/*
- * Returns total memory used by allocator (userdata + metadata)
- */
-u64 xv_get_total_size_bytes(struct xv_pool *pool)
-{
-	return pool->total_pages << PAGE_SHIFT;
-}
-EXPORT_SYMBOL_GPL(xv_get_total_size_bytes);
diff --git a/drivers/staging/ramster/xvmalloc.h b/drivers/staging/ramster/xvmalloc.h
deleted file mode 100644
index 5b1a81aa5..000000000
--- a/drivers/staging/ramster/xvmalloc.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * xvmalloc memory allocator
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-#ifndef _XV_MALLOC_H_
-#define _XV_MALLOC_H_
-
-#include <linux/types.h>
-
-struct xv_pool;
-
-struct xv_pool *xv_create_pool(void);
-void xv_destroy_pool(struct xv_pool *pool);
-
-int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
-			u32 *offset, gfp_t flags);
-void xv_free(struct xv_pool *pool, struct page *page, u32 offset);
-
-u32 xv_get_object_size(void *obj);
-u64 xv_get_total_size_bytes(struct xv_pool *pool);
-
-#endif
diff --git a/drivers/staging/ramster/xvmalloc_int.h b/drivers/staging/ramster/xvmalloc_int.h
deleted file mode 100644
index b5f1f7feb..000000000
--- a/drivers/staging/ramster/xvmalloc_int.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * xvmalloc memory allocator
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-#ifndef _XV_MALLOC_INT_H_
-#define _XV_MALLOC_INT_H_
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-/* User configurable params */
-
-/* Must be power of two */
-#ifdef CONFIG_64BIT
-#define XV_ALIGN_SHIFT 3
-#else
-#define XV_ALIGN_SHIFT	2
-#endif
-#define XV_ALIGN	(1 << XV_ALIGN_SHIFT)
-#define XV_ALIGN_MASK	(XV_ALIGN - 1)
-
-/* This must be greater than sizeof(link_free) */
-#define XV_MIN_ALLOC_SIZE	32
-#define XV_MAX_ALLOC_SIZE	(PAGE_SIZE - XV_ALIGN)
-
-/*
- * Free lists are separated by FL_DELTA bytes
- * This value is 3 for 4k pages and 4 for 64k pages, for any
- * other page size, a conservative (PAGE_SHIFT - 9) is used.
- */
-#if PAGE_SHIFT == 16
-#define FL_DELTA_SHIFT 4
-#else
-#define FL_DELTA_SHIFT (PAGE_SHIFT - 9)
-#endif
-#define FL_DELTA	(1 << FL_DELTA_SHIFT)
-#define FL_DELTA_MASK	(FL_DELTA - 1)
-#define NUM_FREE_LISTS	((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
-				/ FL_DELTA + 1)
-
-#define MAX_FLI		DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG)
-
-/* End of user params */
-
-enum blockflags {
-	BLOCK_FREE,
-	PREV_FREE,
-	__NR_BLOCKFLAGS,
-};
-
-#define FLAGS_MASK	XV_ALIGN_MASK
-#define PREV_MASK	(~FLAGS_MASK)
-
-struct freelist_entry {
-	struct page *page;
-	u16 offset;
-	u16 pad;
-};
-
-struct link_free {
-	struct page *prev_page;
-	struct page *next_page;
-	u16 prev_offset;
-	u16 next_offset;
-};
-
-struct block_header {
-	union {
-		/* This common header must be XV_ALIGN bytes */
-		u8 common[XV_ALIGN];
-		struct {
-			u16 size;
-			u16 prev;
-		};
-	};
-	struct link_free link;
-};
-
-struct xv_pool {
-	ulong flbitmap;
-	ulong slbitmap[MAX_FLI];
-	u64 total_pages;	/* stats */
-	struct freelist_entry freelist[NUM_FREE_LISTS];
-	spinlock_t lock;
-};
-
-#endif
diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c
deleted file mode 100644
index 68b2e053a..000000000
--- a/drivers/staging/ramster/zcache-main.c
+++ /dev/null
@@ -1,3320 +0,0 @@
-/*
- * zcache.c
- *
- * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
- * Copyright (c) 2010,2011, Nitin Gupta
- *
- * Zcache provides an in-kernel "host implementation" for transcendent memory
- * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
- * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
- * 1) "compression buddies" ("zbud") is used for ephemeral pages
- * 2) xvmalloc is used for persistent pages.
- * Xvmalloc (based on the TLSF allocator) has very low fragmentation
- * so maximizes space efficiency, while zbud allows pairs (and potentially,
- * in the future, more than a pair of) compressed pages to be closely linked
- * so that reclaiming can be done via the kernel's physical-page-oriented
- * "shrinker" interface.
- *
- * [1] For a definition of page-accessible memory (aka PAM), see:
- *   http://marc.info/?l=linux-mm&m=127811271605009
- *  RAMSTER TODO:
- *   - handle remotifying of buddied pages (see zbud_remotify_zbpg)
- *   - kernel boot params: nocleancache/nofrontswap don't always work?!?
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/highmem.h>
-#include <linux/list.h>
-#include <linux/lzo.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/atomic.h>
-#include <linux/math64.h>
-#include "tmem.h"
-#include "zcache.h"
-#include "ramster.h"
-#include "cluster/tcp.h"
-
-#include "xvmalloc.h"	/* temporary until change to zsmalloc */
-
-#define	RAMSTER_TESTING
-
-#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
-#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
-#endif
-#ifdef CONFIG_CLEANCACHE
-#include <linux/cleancache.h>
-#endif
-#ifdef CONFIG_FRONTSWAP
-#include <linux/frontswap.h>
-#endif
-
-enum ramster_remotify_op {
-	RAMSTER_REMOTIFY_EPH_PUT,
-	RAMSTER_REMOTIFY_PERS_PUT,
-	RAMSTER_REMOTIFY_FLUSH_PAGE,
-	RAMSTER_REMOTIFY_FLUSH_OBJ,
-	RAMSTER_INTRANSIT_PERS
-};
-
-struct ramster_remotify_hdr {
-	enum ramster_remotify_op op;
-	struct list_head list;
-};
-
-#define ZBH_SENTINEL  0x43214321
-#define ZBPG_SENTINEL  0xdeadbeef
-
-#define ZBUD_MAX_BUDS 2
-
-struct zbud_hdr {
-	struct ramster_remotify_hdr rem_op;
-	uint16_t client_id;
-	uint16_t pool_id;
-	struct tmem_oid oid;
-	uint32_t index;
-	uint16_t size; /* compressed size in bytes, zero means unused */
-	DECL_SENTINEL
-};
-
-#define ZVH_SENTINEL  0x43214321
-static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
-
-struct zv_hdr {
-	struct ramster_remotify_hdr rem_op;
-	uint16_t client_id;
-	uint16_t pool_id;
-	struct tmem_oid oid;
-	uint32_t index;
-	DECL_SENTINEL
-};
-
-struct flushlist_node {
-	struct ramster_remotify_hdr rem_op;
-	struct tmem_xhandle xh;
-};
-
-union {
-	struct ramster_remotify_hdr rem_op;
-	struct zv_hdr zv;
-	struct zbud_hdr zbud;
-	struct flushlist_node flist;
-} remotify_list_node;
-
-static LIST_HEAD(zcache_rem_op_list);
-static DEFINE_SPINLOCK(zcache_rem_op_list_lock);
-
-#if 0
-/* this is more aggressive but may cause other problems? */
-#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
-#else
-#define ZCACHE_GFP_MASK \
-	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
-#endif
-
-#define MAX_POOLS_PER_CLIENT 16
-
-#define MAX_CLIENTS 16
-#define LOCAL_CLIENT ((uint16_t)-1)
-
-MODULE_LICENSE("GPL");
-
-struct zcache_client {
-	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
-	struct xv_pool *xvpool;
-	bool allocated;
-	atomic_t refcount;
-};
-
-static struct zcache_client zcache_host;
-static struct zcache_client zcache_clients[MAX_CLIENTS];
-
-static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
-{
-	BUG_ON(cli == NULL);
-	if (cli == &zcache_host)
-		return LOCAL_CLIENT;
-	return cli - &zcache_clients[0];
-}
-
-static inline bool is_local_client(struct zcache_client *cli)
-{
-	return cli == &zcache_host;
-}
-
-/**********
- * Compression buddies ("zbud") provides for packing two (or, possibly
- * in the future, more) compressed ephemeral pages into a single "raw"
- * (physical) page and tracking them with data structures so that
- * the raw pages can be easily reclaimed.
- *
- * A zbud page ("zbpg") is an aligned page containing a list_head,
- * a lock, and two "zbud headers".  The remainder of the physical
- * page is divided up into aligned 64-byte "chunks" which contain
- * the compressed data for zero, one, or two zbuds.  Each zbpg
- * resides on: (1) an "unused list" if it has no zbuds; (2) a
- * "buddied" list if it is fully populated  with two zbuds; or
- * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
- * the one unbuddied zbud uses.  The data inside a zbpg cannot be
- * read or written unless the zbpg's lock is held.
- */
-
-struct zbud_page {
-	struct list_head bud_list;
-	spinlock_t lock;
-	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
-	DECL_SENTINEL
-	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
-};
-
-#define CHUNK_SHIFT	6
-#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
-#define CHUNK_MASK	(~(CHUNK_SIZE-1))
-#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
-				CHUNK_MASK) >> CHUNK_SHIFT)
-#define MAX_CHUNK	(NCHUNKS-1)
-
-static struct {
-	struct list_head list;
-	unsigned count;
-} zbud_unbuddied[NCHUNKS];
-/* list N contains pages with N chunks USED and NCHUNKS-N unused */
-/* element 0 is never used but optimizing that isn't worth it */
-static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
-
-struct list_head zbud_buddied_list;
-static unsigned long zcache_zbud_buddied_count;
-
-/* protects the buddied list and all unbuddied lists */
-static DEFINE_SPINLOCK(zbud_budlists_spinlock);
-
-static atomic_t zcache_zbud_curr_raw_pages;
-static atomic_t zcache_zbud_curr_zpages;
-static unsigned long zcache_zbud_curr_zbytes;
-static unsigned long zcache_zbud_cumul_zpages;
-static unsigned long zcache_zbud_cumul_zbytes;
-static unsigned long zcache_compress_poor;
-static unsigned long zcache_policy_percent_exceeded;
-static unsigned long zcache_mean_compress_poor;
-
-/*
- * RAMster counters
- * - Remote pages are pages with a local pampd but the data is remote
- * - Foreign pages are pages stored locally but belonging to another node
- */
-static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
-static unsigned long ramster_pers_remotify_enable;
-static unsigned long ramster_eph_remotify_enable;
-static unsigned long ramster_eph_pages_remoted;
-static unsigned long ramster_eph_pages_remote_failed;
-static unsigned long ramster_pers_pages_remoted;
-static unsigned long ramster_pers_pages_remote_failed;
-static unsigned long ramster_pers_pages_remote_nomem;
-static unsigned long ramster_remote_objects_flushed;
-static unsigned long ramster_remote_object_flushes_failed;
-static unsigned long ramster_remote_pages_flushed;
-static unsigned long ramster_remote_page_flushes_failed;
-static unsigned long ramster_remote_eph_pages_succ_get;
-static unsigned long ramster_remote_pers_pages_succ_get;
-static unsigned long ramster_remote_eph_pages_unsucc_get;
-static unsigned long ramster_remote_pers_pages_unsucc_get;
-static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0);
-static unsigned long ramster_curr_flnode_count_max;
-static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0);
-static unsigned long ramster_foreign_eph_pampd_count_max;
-static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0);
-static unsigned long ramster_foreign_pers_pampd_count_max;
-
-/* forward references */
-static void *zcache_get_free_page(void);
-static void zcache_free_page(void *p);
-
-/*
- * zbud helper functions
- */
-
-static inline unsigned zbud_max_buddy_size(void)
-{
-	return MAX_CHUNK << CHUNK_SHIFT;
-}
-
-static inline unsigned zbud_size_to_chunks(unsigned size)
-{
-	BUG_ON(size == 0 || size > zbud_max_buddy_size());
-	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-static inline int zbud_budnum(struct zbud_hdr *zh)
-{
-	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
-	struct zbud_page *zbpg = NULL;
-	unsigned budnum = -1U;
-	int i;
-
-	for (i = 0; i < ZBUD_MAX_BUDS; i++)
-		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
-			budnum = i;
-			break;
-		}
-	BUG_ON(budnum == -1U);
-	return budnum;
-}
-
-static char *zbud_data(struct zbud_hdr *zh, unsigned size)
-{
-	struct zbud_page *zbpg;
-	char *p;
-	unsigned budnum;
-
-	ASSERT_SENTINEL(zh, ZBH);
-	budnum = zbud_budnum(zh);
-	BUG_ON(size == 0 || size > zbud_max_buddy_size());
-	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
-	ASSERT_SPINLOCK(&zbpg->lock);
-	p = (char *)zbpg;
-	if (budnum == 0)
-		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
-							CHUNK_MASK);
-	else if (budnum == 1)
-		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
-	return p;
-}
-
-static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh)
-{
-	struct zbud_page *zbpg;
-	char *p;
-	unsigned budnum;
-
-	ASSERT_SENTINEL(zh, ZBH);
-	budnum = zbud_budnum(zh);
-	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
-	spin_lock(&zbpg->lock);
-	BUG_ON(zh->size > *size);
-	p = (char *)zbpg;
-	if (budnum == 0)
-		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
-							CHUNK_MASK);
-	else if (budnum == 1)
-		p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK);
-	/* client should be filled in by caller */
-	memcpy(data, p, zh->size);
-	*size = zh->size;
-	spin_unlock(&zbpg->lock);
-}
-
-/*
- * zbud raw page management
- */
-
-static struct zbud_page *zbud_alloc_raw_page(void)
-{
-	struct zbud_page *zbpg = NULL;
-	struct zbud_hdr *zh0, *zh1;
-		zbpg = zcache_get_free_page();
-	if (likely(zbpg != NULL)) {
-		INIT_LIST_HEAD(&zbpg->bud_list);
-		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
-		spin_lock_init(&zbpg->lock);
-		atomic_inc(&zcache_zbud_curr_raw_pages);
-		INIT_LIST_HEAD(&zbpg->bud_list);
-		SET_SENTINEL(zbpg, ZBPG);
-		zh0->size = 0; zh1->size = 0;
-		tmem_oid_set_invalid(&zh0->oid);
-		tmem_oid_set_invalid(&zh1->oid);
-	}
-	return zbpg;
-}
-
-static void zbud_free_raw_page(struct zbud_page *zbpg)
-{
-	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
-
-	ASSERT_SENTINEL(zbpg, ZBPG);
-	BUG_ON(!list_empty(&zbpg->bud_list));
-	ASSERT_SPINLOCK(&zbpg->lock);
-	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
-	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
-	INVERT_SENTINEL(zbpg, ZBPG);
-	spin_unlock(&zbpg->lock);
-	atomic_dec(&zcache_zbud_curr_raw_pages);
-	zcache_free_page(zbpg);
-}
-
-/*
- * core zbud handling routines
- */
-
-static unsigned zbud_free(struct zbud_hdr *zh)
-{
-	unsigned size;
-
-	ASSERT_SENTINEL(zh, ZBH);
-	BUG_ON(!tmem_oid_valid(&zh->oid));
-	size = zh->size;
-	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
-	zh->size = 0;
-	tmem_oid_set_invalid(&zh->oid);
-	INVERT_SENTINEL(zh, ZBH);
-	zcache_zbud_curr_zbytes -= size;
-	atomic_dec(&zcache_zbud_curr_zpages);
-	return size;
-}
-
-static void zbud_free_and_delist(struct zbud_hdr *zh)
-{
-	unsigned chunks;
-	struct zbud_hdr *zh_other;
-	unsigned budnum = zbud_budnum(zh), size;
-	struct zbud_page *zbpg =
-		container_of(zh, struct zbud_page, buddy[budnum]);
-
-	/* FIXME, should be BUG_ON, pool destruction path doesn't disable
-	 * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()->
-	 * tmem_objnode_node_destroy()-> zcache_pampd_free() */
-	WARN_ON(!irqs_disabled());
-	spin_lock(&zbpg->lock);
-	if (list_empty(&zbpg->bud_list)) {
-		/* ignore zombie page... see zbud_evict_pages() */
-		spin_unlock(&zbpg->lock);
-		return;
-	}
-	size = zbud_free(zh);
-	ASSERT_SPINLOCK(&zbpg->lock);
-	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
-	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
-		chunks = zbud_size_to_chunks(size) ;
-		spin_lock(&zbud_budlists_spinlock);
-		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
-		list_del_init(&zbpg->bud_list);
-		zbud_unbuddied[chunks].count--;
-		spin_unlock(&zbud_budlists_spinlock);
-		zbud_free_raw_page(zbpg);
-	} else { /* was buddied: move remaining buddy to unbuddied list */
-		chunks = zbud_size_to_chunks(zh_other->size) ;
-		spin_lock(&zbud_budlists_spinlock);
-		list_del_init(&zbpg->bud_list);
-		zcache_zbud_buddied_count--;
-		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
-		zbud_unbuddied[chunks].count++;
-		spin_unlock(&zbud_budlists_spinlock);
-		spin_unlock(&zbpg->lock);
-	}
-}
-
-static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
-					struct tmem_oid *oid,
-					uint32_t index, struct page *page,
-					void *cdata, unsigned size)
-{
-	struct zbud_hdr *zh0, *zh1, *zh = NULL;
-	struct zbud_page *zbpg = NULL, *ztmp;
-	unsigned nchunks;
-	char *to;
-	int i, found_good_buddy = 0;
-
-	nchunks = zbud_size_to_chunks(size) ;
-	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
-		spin_lock(&zbud_budlists_spinlock);
-		if (!list_empty(&zbud_unbuddied[i].list)) {
-			list_for_each_entry_safe(zbpg, ztmp,
-				    &zbud_unbuddied[i].list, bud_list) {
-				if (spin_trylock(&zbpg->lock)) {
-					found_good_buddy = i;
-					goto found_unbuddied;
-				}
-			}
-		}
-		spin_unlock(&zbud_budlists_spinlock);
-	}
-	/* didn't find a good buddy, try allocating a new page */
-	zbpg = zbud_alloc_raw_page();
-	if (unlikely(zbpg == NULL))
-		goto out;
-	/* ok, have a page, now compress the data before taking locks */
-	spin_lock(&zbud_budlists_spinlock);
-	spin_lock(&zbpg->lock);
-	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
-	zbud_unbuddied[nchunks].count++;
-	zh = &zbpg->buddy[0];
-	goto init_zh;
-
-found_unbuddied:
-	ASSERT_SPINLOCK(&zbpg->lock);
-	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
-	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
-	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
-		ASSERT_SENTINEL(zh0, ZBH);
-		zh = zh1;
-	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
-		ASSERT_SENTINEL(zh1, ZBH);
-		zh = zh0;
-	} else
-		BUG();
-	list_del_init(&zbpg->bud_list);
-	zbud_unbuddied[found_good_buddy].count--;
-	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
-	zcache_zbud_buddied_count++;
-
-init_zh:
-	SET_SENTINEL(zh, ZBH);
-	zh->size = size;
-	zh->index = index;
-	zh->oid = *oid;
-	zh->pool_id = pool_id;
-	zh->client_id = client_id;
-	to = zbud_data(zh, size);
-	memcpy(to, cdata, size);
-	spin_unlock(&zbpg->lock);
-	spin_unlock(&zbud_budlists_spinlock);
-	zbud_cumul_chunk_counts[nchunks]++;
-	atomic_inc(&zcache_zbud_curr_zpages);
-	zcache_zbud_cumul_zpages++;
-	zcache_zbud_curr_zbytes += size;
-	zcache_zbud_cumul_zbytes += size;
-out:
-	return zh;
-}
-
-static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
-{
-	struct zbud_page *zbpg;
-	unsigned budnum = zbud_budnum(zh);
-	size_t out_len = PAGE_SIZE;
-	char *to_va, *from_va;
-	unsigned size;
-	int ret = 0;
-
-	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
-	spin_lock(&zbpg->lock);
-	if (list_empty(&zbpg->bud_list)) {
-		/* ignore zombie page... see zbud_evict_pages() */
-		ret = -EINVAL;
-		goto out;
-	}
-	ASSERT_SENTINEL(zh, ZBH);
-	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
-	to_va = kmap_atomic(page);
-	size = zh->size;
-	from_va = zbud_data(zh, size);
-	ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
-	BUG_ON(ret != LZO_E_OK);
-	BUG_ON(out_len != PAGE_SIZE);
-	kunmap_atomic(to_va);
-out:
-	spin_unlock(&zbpg->lock);
-	return ret;
-}
-
-/*
- * The following routines handle shrinking of ephemeral pages by evicting
- * pages "least valuable" first.
- */
-
-static unsigned long zcache_evicted_raw_pages;
-static unsigned long zcache_evicted_buddied_pages;
-static unsigned long zcache_evicted_unbuddied_pages;
-
-static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
-						uint16_t poolid);
-static void zcache_put_pool(struct tmem_pool *pool);
-
-/*
- * Flush and free all zbuds in a zbpg, then free the pageframe
- */
-static void zbud_evict_zbpg(struct zbud_page *zbpg)
-{
-	struct zbud_hdr *zh;
-	int i, j;
-	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
-	uint32_t index[ZBUD_MAX_BUDS];
-	struct tmem_oid oid[ZBUD_MAX_BUDS];
-	struct tmem_pool *pool;
-	unsigned long flags;
-
-	ASSERT_SPINLOCK(&zbpg->lock);
-	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
-		zh = &zbpg->buddy[i];
-		if (zh->size) {
-			client_id[j] = zh->client_id;
-			pool_id[j] = zh->pool_id;
-			oid[j] = zh->oid;
-			index[j] = zh->index;
-			j++;
-		}
-	}
-	spin_unlock(&zbpg->lock);
-	for (i = 0; i < j; i++) {
-		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
-		BUG_ON(pool == NULL);
-		local_irq_save(flags);
-		/* these flushes should dispose of any local storage */
-		tmem_flush_page(pool, &oid[i], index[i]);
-		local_irq_restore(flags);
-		zcache_put_pool(pool);
-	}
-}
-
-/*
- * Free nr pages.  This code is funky because we want to hold the locks
- * protecting various lists for as short a time as possible, and in some
- * circumstances the list may change asynchronously when the list lock is
- * not held.  In some cases we also trylock not only to avoid waiting on a
- * page in use by another cpu, but also to avoid potential deadlock due to
- * lock inversion.
- */
-static void zbud_evict_pages(int nr)
-{
-	struct zbud_page *zbpg;
-	int i, newly_unused_pages = 0;
-
-
-	/* now try freeing unbuddied pages, starting with least space avail */
-	for (i = 0; i < MAX_CHUNK; i++) {
-retry_unbud_list_i:
-		spin_lock_bh(&zbud_budlists_spinlock);
-		if (list_empty(&zbud_unbuddied[i].list)) {
-			spin_unlock_bh(&zbud_budlists_spinlock);
-			continue;
-		}
-		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
-			if (unlikely(!spin_trylock(&zbpg->lock)))
-				continue;
-			zbud_unbuddied[i].count--;
-			spin_unlock(&zbud_budlists_spinlock);
-			zcache_evicted_unbuddied_pages++;
-			/* want budlists unlocked when doing zbpg eviction */
-			zbud_evict_zbpg(zbpg);
-			newly_unused_pages++;
-			local_bh_enable();
-			if (--nr <= 0)
-				goto evict_unused;
-			goto retry_unbud_list_i;
-		}
-		spin_unlock_bh(&zbud_budlists_spinlock);
-	}
-
-	/* as a last resort, free buddied pages */
-retry_bud_list:
-	spin_lock_bh(&zbud_budlists_spinlock);
-	if (list_empty(&zbud_buddied_list)) {
-		spin_unlock_bh(&zbud_budlists_spinlock);
-		goto evict_unused;
-	}
-	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
-		if (unlikely(!spin_trylock(&zbpg->lock)))
-			continue;
-		zcache_zbud_buddied_count--;
-		spin_unlock(&zbud_budlists_spinlock);
-		zcache_evicted_buddied_pages++;
-		/* want budlists unlocked when doing zbpg eviction */
-		zbud_evict_zbpg(zbpg);
-		newly_unused_pages++;
-		local_bh_enable();
-		if (--nr <= 0)
-			goto evict_unused;
-		goto retry_bud_list;
-	}
-	spin_unlock_bh(&zbud_budlists_spinlock);
-
-evict_unused:
-	return;
-}
-
-static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem);
-
-static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data,
-				size_t size)
-{
-	struct tmem_pool *pool;
-	int i, remotenode, ret = -1;
-	unsigned char cksum, *p;
-	unsigned long flags;
-
-	for (p = data, cksum = 0, i = 0; i < size; i++)
-		cksum += *p;
-	ret = ramster_remote_put(xh, data, size, true, &remotenode);
-	if (ret == 0) {
-		/* data was successfully remoted so change the local version
-		 * to point to the remote node where it landed */
-		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id);
-		BUG_ON(pool == NULL);
-		local_irq_save(flags);
-		/* tmem_replace will also free up any local space */
-		(void)tmem_replace(pool, &xh->oid, xh->index,
-			pampd_make_remote(remotenode, size, cksum));
-		local_irq_restore(flags);
-		zcache_put_pool(pool);
-		ramster_eph_pages_remoted++;
-		ret = 0;
-	} else
-		ramster_eph_pages_remote_failed++;
-	return ret;
-}
-
-static int zbud_remotify_zbpg(struct zbud_page *zbpg)
-{
-	struct zbud_hdr *zh1, *zh2 = NULL;
-	struct tmem_xhandle xh1, xh2 = { 0 };
-	char *data1 = NULL, *data2 = NULL;
-	size_t size1 = 0, size2 = 0;
-	int ret = 0;
-	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);
-
-	ASSERT_SPINLOCK(&zbpg->lock);
-	if (zbpg->buddy[0].size == 0)
-		zh1 = &zbpg->buddy[1];
-	else if (zbpg->buddy[1].size == 0)
-		zh1 = &zbpg->buddy[0];
-	else {
-		zh1 = &zbpg->buddy[0];
-		zh2 = &zbpg->buddy[1];
-	}
-	/* don't remotify pages that are already remotified */
-	if (zh1->client_id != LOCAL_CLIENT)
-		zh1 = NULL;
-	if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT))
-		zh2 = NULL;
-
-	/* copy the data and metadata so can release lock */
-	if (zh1 != NULL) {
-		xh1.client_id = zh1->client_id;
-		xh1.pool_id = zh1->pool_id;
-		xh1.oid = zh1->oid;
-		xh1.index = zh1->index;
-		size1 = zh1->size;
-		data1 = zbud_data(zh1, size1);
-		memcpy(tmpmem, zbud_data(zh1, size1), size1);
-		data1 = tmpmem;
-		tmpmem += size1;
-	}
-	if (zh2 != NULL) {
-		xh2.client_id = zh2->client_id;
-		xh2.pool_id = zh2->pool_id;
-		xh2.oid = zh2->oid;
-		xh2.index = zh2->index;
-		size2 = zh2->size;
-		memcpy(tmpmem, zbud_data(zh2, size2), size2);
-		data2 = tmpmem;
-	}
-	spin_unlock(&zbpg->lock);
-	preempt_enable();
-
-	/* OK, no locks held anymore, remotify one or both zbuds */
-	if (zh1 != NULL)
-		ret = zbud_remotify_zbud(&xh1, data1, size1);
-	if (zh2 != NULL)
-		ret |= zbud_remotify_zbud(&xh2, data2, size2);
-	return ret;
-}
-
-void zbud_remotify_pages(int nr)
-{
-	struct zbud_page *zbpg;
-	int i, ret;
-
-	/*
-	 * for now just try remotifying unbuddied pages, starting with
-	 * least space avail
-	 */
-	for (i = 0; i < MAX_CHUNK; i++) {
-retry_unbud_list_i:
-		preempt_disable();  /* enable in zbud_remotify_zbpg */
-		spin_lock_bh(&zbud_budlists_spinlock);
-		if (list_empty(&zbud_unbuddied[i].list)) {
-			spin_unlock_bh(&zbud_budlists_spinlock);
-			preempt_enable();
-			continue; /* next i in for loop */
-		}
-		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
-			if (unlikely(!spin_trylock(&zbpg->lock)))
-				continue; /* next list_for_each_entry */
-			zbud_unbuddied[i].count--;
-			/* want budlists unlocked when doing zbpg remotify */
-			spin_unlock_bh(&zbud_budlists_spinlock);
-			ret = zbud_remotify_zbpg(zbpg);
-			/* preemption is re-enabled in zbud_remotify_zbpg */
-			if (ret == 0) {
-				if (--nr <= 0)
-					goto out;
-				goto retry_unbud_list_i;
-			}
-			/* if fail to remotify any page, quit */
-			pr_err("TESTING zbud_remotify_pages failed on page,"
-				" trying to re-add\n");
-			spin_lock_bh(&zbud_budlists_spinlock);
-			spin_lock(&zbpg->lock);
-			list_add_tail(&zbpg->bud_list, &zbud_unbuddied[i].list);
-			zbud_unbuddied[i].count++;
-			spin_unlock(&zbpg->lock);
-			spin_unlock_bh(&zbud_budlists_spinlock);
-			pr_err("TESTING zbud_remotify_pages failed on page,"
-				" finished re-add\n");
-			goto out;
-		}
-		spin_unlock_bh(&zbud_budlists_spinlock);
-		preempt_enable();
-	}
-
-next_buddied_zbpg:
-	preempt_disable();  /* enable in zbud_remotify_zbpg */
-	spin_lock_bh(&zbud_budlists_spinlock);
-	if (list_empty(&zbud_buddied_list))
-		goto unlock_out;
-	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
-		if (unlikely(!spin_trylock(&zbpg->lock)))
-			continue; /* next list_for_each_entry */
-		zcache_zbud_buddied_count--;
-		/* want budlists unlocked when doing zbpg remotify */
-		spin_unlock_bh(&zbud_budlists_spinlock);
-		ret = zbud_remotify_zbpg(zbpg);
-		/* preemption is re-enabled in zbud_remotify_zbpg */
-		if (ret == 0) {
-			if (--nr <= 0)
-				goto out;
-			goto next_buddied_zbpg;
-		}
-		/* if fail to remotify any page, quit */
-		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
-			" trying to re-add\n");
-		spin_lock_bh(&zbud_budlists_spinlock);
-		spin_lock(&zbpg->lock);
-		list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
-		zcache_zbud_buddied_count++;
-		spin_unlock(&zbpg->lock);
-		spin_unlock_bh(&zbud_budlists_spinlock);
-		pr_err("TESTING zbud_remotify_pages failed on BUDDIED page,"
-			" finished re-add\n");
-		goto out;
-	}
-unlock_out:
-	spin_unlock_bh(&zbud_budlists_spinlock);
-	preempt_enable();
-out:
-	return;
-}
-
-/* the "flush list" asynchronously collects pages to remotely flush */
-#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
-static void ramster_flnode_free(struct flushlist_node *,
-				struct tmem_pool *);
-
-static void zcache_remote_flush_page(struct flushlist_node *flnode)
-{
-	struct tmem_xhandle *xh;
-	int remotenode, ret;
-
-	preempt_disable();
-	xh = &flnode->xh;
-	remotenode = flnode->xh.client_id;
-	ret = ramster_remote_flush(xh, remotenode);
-	if (ret >= 0)
-		ramster_remote_pages_flushed++;
-	else
-		ramster_remote_page_flushes_failed++;
-	preempt_enable_no_resched();
-	ramster_flnode_free(flnode, NULL);
-}
-
-static void zcache_remote_flush_object(struct flushlist_node *flnode)
-{
-	struct tmem_xhandle *xh;
-	int remotenode, ret;
-
-	preempt_disable();
-	xh = &flnode->xh;
-	remotenode = flnode->xh.client_id;
-	ret = ramster_remote_flush_object(xh, remotenode);
-	if (ret >= 0)
-		ramster_remote_objects_flushed++;
-	else
-		ramster_remote_object_flushes_failed++;
-	preempt_enable_no_resched();
-	ramster_flnode_free(flnode, NULL);
-}
-
-static void zcache_remote_eph_put(struct zbud_hdr *zbud)
-{
-	/* FIXME */
-}
-
-static void zcache_remote_pers_put(struct zv_hdr *zv)
-{
-	struct tmem_xhandle xh;
-	uint16_t size;
-	bool ephemeral;
-	int remotenode, ret = -1;
-	char *data;
-	struct tmem_pool *pool;
-	unsigned long flags;
-	unsigned char cksum;
-	char *p;
-	int i;
-	unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem);
-
-	ASSERT_SENTINEL(zv, ZVH);
-	BUG_ON(zv->client_id != LOCAL_CLIENT);
-	local_bh_disable();
-	xh.client_id = zv->client_id;
-	xh.pool_id = zv->pool_id;
-	xh.oid = zv->oid;
-	xh.index = zv->index;
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
-	data = (char *)zv + sizeof(*zv);
-	for (p = data, cksum = 0, i = 0; i < size; i++)
-		cksum += *p;
-	memcpy(tmpmem, data, size);
-	data = tmpmem;
-	pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id);
-	ephemeral = is_ephemeral(pool);
-	zcache_put_pool(pool);
-	/* now OK to release lock set in caller */
-	spin_unlock(&zcache_rem_op_list_lock);
-	local_bh_enable();
-	preempt_disable();
-	ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode);
-	preempt_enable_no_resched();
-	if (ret != 0) {
-		/*
-		 * This is some form of a memory leak... if the remote put
-		 * fails, there will never be another attempt to remotify
-		 * this page.  But since we've dropped the zv pointer,
-		 * the page may have been freed or the data replaced
-		 * so we can't just "put it back" in the remote op list.
-		 * Even if we could, not sure where to put it in the list
-		 * because there may be flushes that must be strictly
-		 * ordered vs the put.  So leave this as a FIXME for now.
-		 * But count them so we know if it becomes a problem.
-		 */
-		ramster_pers_pages_remote_failed++;
-		goto out;
-	} else
-		atomic_inc(&ramster_remote_pers_pages);
-	ramster_pers_pages_remoted++;
-	/*
-	 * data was successfully remoted so change the local version to
-	 * point to the remote node where it landed
-	 */
-	local_bh_disable();
-	pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
-	local_irq_save(flags);
-	(void)tmem_replace(pool, &xh.oid, xh.index,
-			pampd_make_remote(remotenode, size, cksum));
-	local_irq_restore(flags);
-	zcache_put_pool(pool);
-	local_bh_enable();
-out:
-	return;
-}
-
-static void zcache_do_remotify_ops(int nr)
-{
-	struct ramster_remotify_hdr *rem_op;
-	union remotify_list_node *u;
-
-	while (1) {
-		if (!nr)
-			goto out;
-		spin_lock(&zcache_rem_op_list_lock);
-		if (list_empty(&zcache_rem_op_list)) {
-			spin_unlock(&zcache_rem_op_list_lock);
-			goto out;
-		}
-		rem_op = list_first_entry(&zcache_rem_op_list,
-				struct ramster_remotify_hdr, list);
-		list_del_init(&rem_op->list);
-		if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT)
-			spin_unlock(&zcache_rem_op_list_lock);
-		u = (union remotify_list_node *)rem_op;
-		switch (rem_op->op) {
-		case RAMSTER_REMOTIFY_EPH_PUT:
-BUG();
-			zcache_remote_eph_put((struct zbud_hdr *)rem_op);
-			break;
-		case RAMSTER_REMOTIFY_PERS_PUT:
-			zcache_remote_pers_put((struct zv_hdr *)rem_op);
-			break;
-		case RAMSTER_REMOTIFY_FLUSH_PAGE:
-			zcache_remote_flush_page((struct flushlist_node *)u);
-			break;
-		case RAMSTER_REMOTIFY_FLUSH_OBJ:
-			zcache_remote_flush_object((struct flushlist_node *)u);
-			break;
-		default:
-			BUG();
-		}
-	}
-out:
-	return;
-}
-
-/*
- * Communicate interface revision with userspace
- */
-#include "cluster/ramster_nodemanager.h"
-static unsigned long ramster_interface_revision  = R2NM_API_VERSION;
-
-/*
- * For now, just push over a few pages every few seconds to
- * ensure that it basically works
- */
-static struct workqueue_struct *ramster_remotify_workqueue;
-static void ramster_remotify_process(struct work_struct *work);
-static DECLARE_DELAYED_WORK(ramster_remotify_worker,
-		ramster_remotify_process);
-
-static void ramster_remotify_queue_delayed_work(unsigned long delay)
-{
-	if (!queue_delayed_work(ramster_remotify_workqueue,
-				&ramster_remotify_worker, delay))
-		pr_err("ramster_remotify: bad workqueue\n");
-}
-
-
-static int use_frontswap;
-static int use_cleancache;
-static int ramster_remote_target_nodenum = -1;
-static void ramster_remotify_process(struct work_struct *work)
-{
-	static bool remotify_in_progress;
-
-	BUG_ON(irqs_disabled());
-	if (remotify_in_progress)
-		ramster_remotify_queue_delayed_work(HZ);
-	else if (ramster_remote_target_nodenum != -1) {
-		remotify_in_progress = true;
-#ifdef CONFIG_CLEANCACHE
-	if (use_cleancache && ramster_eph_remotify_enable)
-		zbud_remotify_pages(5000); /* FIXME is this a good number? */
-#endif
-#ifdef CONFIG_FRONTSWAP
-	if (use_frontswap && ramster_pers_remotify_enable)
-		zcache_do_remotify_ops(500); /* FIXME is this a good number? */
-#endif
-		remotify_in_progress = false;
-		ramster_remotify_queue_delayed_work(HZ);
-	}
-}
-
-static void ramster_remotify_init(void)
-{
-	unsigned long n = 60UL;
-	ramster_remotify_workqueue =
-		create_singlethread_workqueue("ramster_remotify");
-	ramster_remotify_queue_delayed_work(n * HZ);
-}
-
-
-static void zbud_init(void)
-{
-	int i;
-
-	INIT_LIST_HEAD(&zbud_buddied_list);
-	zcache_zbud_buddied_count = 0;
-	for (i = 0; i < NCHUNKS; i++) {
-		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
-		zbud_unbuddied[i].count = 0;
-	}
-}
-
-#ifdef CONFIG_SYSFS
-/*
- * These sysfs routines show a nice distribution of how many zbpg's are
- * currently (and have ever been placed) in each unbuddied list.  It's fun
- * to watch but can probably go away before final merge.
- */
-static int zbud_show_unbuddied_list_counts(char *buf)
-{
-	int i;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++)
-		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
-	return p - buf;
-}
-
-static int zbud_show_cumul_chunk_counts(char *buf)
-{
-	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
-	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
-	unsigned long total_chunks_lte_42 = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
-		chunks += zbud_cumul_chunk_counts[i];
-		total_chunks += zbud_cumul_chunk_counts[i];
-		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
-		if (i == 21)
-			total_chunks_lte_21 = total_chunks;
-		if (i == 32)
-			total_chunks_lte_32 = total_chunks;
-		if (i == 42)
-			total_chunks_lte_42 = total_chunks;
-	}
-	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
-		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-#endif
-
-/**********
- * This "zv" PAM implementation combines the TLSF-based xvMalloc
- * with lzo1x compression to maximize the amount of data that can
- * be packed into a physical page.
- *
- * Zv represents a PAM page with the index and object (plus a "size" value
- * necessary for decompression) immediately preceding the compressed data.
- */
-
-/* rudimentary policy limits */
-/* total number of persistent pages may not exceed this percentage */
-static unsigned int zv_page_count_policy_percent = 75;
-/*
- * byte count defining poor compression; pages with greater zsize will be
- * rejected
- */
-static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
-/*
- * byte count defining poor *mean* compression; pages with greater zsize
- * will be rejected until sufficient better-compressed pages are accepted
- * driving the mean below this threshold
- */
-static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
-
-static atomic_t zv_curr_dist_counts[NCHUNKS];
-static atomic_t zv_cumul_dist_counts[NCHUNKS];
-
-
-static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id,
-				struct tmem_oid *oid, uint32_t index,
-				void *cdata, unsigned clen)
-{
-	struct page *page;
-	struct zv_hdr *zv = NULL;
-	uint32_t offset;
-	int alloc_size = clen + sizeof(struct zv_hdr);
-	int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
-	int ret;
-
-	BUG_ON(!irqs_disabled());
-	BUG_ON(chunks >= NCHUNKS);
-	ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
-			&page, &offset, ZCACHE_GFP_MASK);
-	if (unlikely(ret))
-		goto out;
-	atomic_inc(&zv_curr_dist_counts[chunks]);
-	atomic_inc(&zv_cumul_dist_counts[chunks]);
-	zv = kmap_atomic(page) + offset;
-	zv->index = index;
-	zv->oid = *oid;
-	zv->pool_id = pool_id;
-	SET_SENTINEL(zv, ZVH);
-	INIT_LIST_HEAD(&zv->rem_op.list);
-	zv->client_id = get_client_id_from_client(cli);
-	zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT;
-	if (zv->client_id == LOCAL_CLIENT) {
-		spin_lock(&zcache_rem_op_list_lock);
-		list_add_tail(&zv->rem_op.list, &zcache_rem_op_list);
-		spin_unlock(&zcache_rem_op_list_lock);
-	}
-	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
-	kunmap_atomic(zv);
-out:
-	return zv;
-}
-
-/* similar to zv_create, but just reserve space, no data yet */
-static struct zv_hdr *zv_alloc(struct tmem_pool *pool,
-				struct tmem_oid *oid, uint32_t index,
-				unsigned clen)
-{
-	struct zcache_client *cli = pool->client;
-	struct page *page;
-	struct zv_hdr *zv = NULL;
-	uint32_t offset;
-	int ret;
-
-	BUG_ON(!irqs_disabled());
-	BUG_ON(!is_local_client(pool->client));
-	ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr),
-			&page, &offset, ZCACHE_GFP_MASK);
-	if (unlikely(ret))
-		goto out;
-	zv = kmap_atomic(page) + offset;
-	SET_SENTINEL(zv, ZVH);
-	INIT_LIST_HEAD(&zv->rem_op.list);
-	zv->client_id = LOCAL_CLIENT;
-	zv->rem_op.op = RAMSTER_INTRANSIT_PERS;
-	zv->index = index;
-	zv->oid = *oid;
-	zv->pool_id = pool->pool_id;
-	kunmap_atomic(zv);
-out:
-	return zv;
-}
-
-static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
-{
-	unsigned long flags;
-	struct page *page;
-	uint32_t offset;
-	uint16_t size = xv_get_object_size(zv);
-	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
-
-	ASSERT_SENTINEL(zv, ZVH);
-	BUG_ON(chunks >= NCHUNKS);
-	atomic_dec(&zv_curr_dist_counts[chunks]);
-	size -= sizeof(*zv);
-	spin_lock(&zcache_rem_op_list_lock);
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0);
-	INVERT_SENTINEL(zv, ZVH);
-	if (!list_empty(&zv->rem_op.list))
-		list_del_init(&zv->rem_op.list);
-	spin_unlock(&zcache_rem_op_list_lock);
-	page = virt_to_page(zv);
-	offset = (unsigned long)zv & ~PAGE_MASK;
-	local_irq_save(flags);
-	xv_free(xvpool, page, offset);
-	local_irq_restore(flags);
-}
-
-static void zv_decompress(struct page *page, struct zv_hdr *zv)
-{
-	size_t clen = PAGE_SIZE;
-	char *to_va;
-	unsigned size;
-	int ret;
-
-	ASSERT_SENTINEL(zv, ZVH);
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0);
-	to_va = kmap_atomic(page);
-	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
-					size, to_va, &clen);
-	kunmap_atomic(to_va);
-	BUG_ON(ret != LZO_E_OK);
-	BUG_ON(clen != PAGE_SIZE);
-}
-
-static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv)
-{
-	unsigned size;
-
-	ASSERT_SENTINEL(zv, ZVH);
-	size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(size == 0 || size > zv_max_page_size);
-	BUG_ON(size > *bufsize);
-	memcpy(data, (char *)zv + sizeof(*zv), size);
-	*bufsize = size;
-}
-
-static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size)
-{
-	unsigned zv_size;
-
-	ASSERT_SENTINEL(zv, ZVH);
-	zv_size = xv_get_object_size(zv) - sizeof(*zv);
-	BUG_ON(zv_size != size);
-	BUG_ON(zv_size == 0 || zv_size > zv_max_page_size);
-	memcpy((char *)zv + sizeof(*zv), data, size);
-}
-
-#ifdef CONFIG_SYSFS
-/*
- * show a distribution of compression stats for zv pages.
- */
-
-static int zv_curr_dist_counts_show(char *buf)
-{
-	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		n = atomic_read(&zv_curr_dist_counts[i]);
-		p += sprintf(p, "%lu ", n);
-		chunks += n;
-		sum_total_chunks += i * n;
-	}
-	p += sprintf(p, "mean:%lu\n",
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-
-static int zv_cumul_dist_counts_show(char *buf)
-{
-	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		n = atomic_read(&zv_cumul_dist_counts[i]);
-		p += sprintf(p, "%lu ", n);
-		chunks += n;
-		sum_total_chunks += i * n;
-	}
-	p += sprintf(p, "mean:%lu\n",
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-
-/*
- * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
- * pages that don't compress to less than this value (including metadata
- * overhead) to be rejected.  We don't allow the value to get too close
- * to PAGE_SIZE.
- */
-static ssize_t zv_max_zsize_show(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%u\n", zv_max_zsize);
-}
-
-static ssize_t zv_max_zsize_store(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
-		return -EINVAL;
-	zv_max_zsize = val;
-	return count;
-}
-
-/*
- * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
- * pages that don't compress to less than this value (including metadata
- * overhead) to be rejected UNLESS the mean compression is also smaller
- * than this value.  In other words, we are load-balancing-by-zsize the
- * accepted pages.  Again, we don't allow the value to get too close
- * to PAGE_SIZE.
- */
-static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%u\n", zv_max_mean_zsize);
-}
-
-static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
-		return -EINVAL;
-	zv_max_mean_zsize = val;
-	return count;
-}
-
-/*
- * setting zv_page_count_policy_percent via sysfs sets an upper bound of
- * persistent (e.g. swap) pages that will be retained according to:
- *     (zv_page_count_policy_percent * totalram_pages) / 100)
- * when that limit is reached, further puts will be rejected (until
- * some pages have been flushed).  Note that, due to compression,
- * this number may exceed 100; it defaults to 75 and we set an
- * arbitary limit of 150.  A poor choice will almost certainly result
- * in OOM's, so this value should only be changed prudently.
- */
-static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
-						 struct kobj_attribute *attr,
-						 char *buf)
-{
-	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
-}
-
-static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
-						  struct kobj_attribute *attr,
-						  const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > 150))
-		return -EINVAL;
-	zv_page_count_policy_percent = val;
-	return count;
-}
-
-static struct kobj_attribute zcache_zv_max_zsize_attr = {
-		.attr = { .name = "zv_max_zsize", .mode = 0644 },
-		.show = zv_max_zsize_show,
-		.store = zv_max_zsize_store,
-};
-
-static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
-		.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
-		.show = zv_max_mean_zsize_show,
-		.store = zv_max_mean_zsize_store,
-};
-
-static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
-		.attr = { .name = "zv_page_count_policy_percent",
-			  .mode = 0644 },
-		.show = zv_page_count_policy_percent_show,
-		.store = zv_page_count_policy_percent_store,
-};
-#endif
-
-/*
- * zcache core code starts here
- */
-
-/* useful stats not collected by cleancache or frontswap */
-static unsigned long zcache_flush_total;
-static unsigned long zcache_flush_found;
-static unsigned long zcache_flobj_total;
-static unsigned long zcache_flobj_found;
-static unsigned long zcache_failed_eph_puts;
-static unsigned long zcache_nonactive_puts;
-static unsigned long zcache_failed_pers_puts;
-
-/*
- * Tmem operations assume the poolid implies the invoking client.
- * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
- * RAMster has each client numbered by cluster node, and a KVM version
- * of zcache would have one client per guest and each client might
- * have a poolid==N.
- */
-static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
-{
-	struct tmem_pool *pool = NULL;
-	struct zcache_client *cli = NULL;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else {
-		if (cli_id >= MAX_CLIENTS)
-			goto out;
-		cli = &zcache_clients[cli_id];
-		if (cli == NULL)
-			goto out;
-		atomic_inc(&cli->refcount);
-	}
-	if (poolid < MAX_POOLS_PER_CLIENT) {
-		pool = cli->tmem_pools[poolid];
-		if (pool != NULL)
-			atomic_inc(&pool->refcount);
-	}
-out:
-	return pool;
-}
-
-static void zcache_put_pool(struct tmem_pool *pool)
-{
-	struct zcache_client *cli = NULL;
-
-	if (pool == NULL)
-		BUG();
-	cli = pool->client;
-	atomic_dec(&pool->refcount);
-	atomic_dec(&cli->refcount);
-}
-
-int zcache_new_client(uint16_t cli_id)
-{
-	struct zcache_client *cli = NULL;
-	int ret = -1;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	if (cli->allocated)
-		goto out;
-	cli->allocated = 1;
-#ifdef CONFIG_FRONTSWAP
-	cli->xvpool = xv_create_pool();
-	if (cli->xvpool == NULL)
-		goto out;
-#endif
-	ret = 0;
-out:
-	return ret;
-}
-
-/* counters for debugging */
-static unsigned long zcache_failed_get_free_pages;
-static unsigned long zcache_failed_alloc;
-static unsigned long zcache_put_to_flush;
-
-/*
- * for now, used named slabs so can easily track usage; later can
- * either just use kmalloc, or perhaps add a slab-like allocator
- * to more carefully manage total memory utilization
- */
-static struct kmem_cache *zcache_objnode_cache;
-static struct kmem_cache *zcache_obj_cache;
-static struct kmem_cache *ramster_flnode_cache;
-static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_obj_count_max;
-static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_objnode_count_max;
-
-/*
- * to avoid memory allocation recursion (e.g. due to direct reclaim), we
- * preload all necessary data structures so the hostops callbacks never
- * actually do a malloc
- */
-struct zcache_preload {
-	void *page;
-	struct tmem_obj *obj;
-	int nr;
-	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
-	struct flushlist_node *flnode;
-};
-static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
-
-static int zcache_do_preload(struct tmem_pool *pool)
-{
-	struct zcache_preload *kp;
-	struct tmem_objnode *objnode;
-	struct tmem_obj *obj;
-	struct flushlist_node *flnode;
-	void *page;
-	int ret = -ENOMEM;
-
-	if (unlikely(zcache_objnode_cache == NULL))
-		goto out;
-	if (unlikely(zcache_obj_cache == NULL))
-		goto out;
-	preempt_disable();
-	kp = &__get_cpu_var(zcache_preloads);
-	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
-		preempt_enable_no_resched();
-		objnode = kmem_cache_alloc(zcache_objnode_cache,
-				ZCACHE_GFP_MASK);
-		if (unlikely(objnode == NULL)) {
-			zcache_failed_alloc++;
-			goto out;
-		}
-		preempt_disable();
-		kp = &__get_cpu_var(zcache_preloads);
-		if (kp->nr < ARRAY_SIZE(kp->objnodes))
-			kp->objnodes[kp->nr++] = objnode;
-		else
-			kmem_cache_free(zcache_objnode_cache, objnode);
-	}
-	preempt_enable_no_resched();
-	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
-	if (unlikely(obj == NULL)) {
-		zcache_failed_alloc++;
-		goto out;
-	}
-	flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK);
-	if (unlikely(flnode == NULL)) {
-		zcache_failed_alloc++;
-		goto out;
-	}
-	if (is_ephemeral(pool)) {
-		page = (void *)__get_free_page(ZCACHE_GFP_MASK);
-		if (unlikely(page == NULL)) {
-			zcache_failed_get_free_pages++;
-			kmem_cache_free(zcache_obj_cache, obj);
-			kmem_cache_free(ramster_flnode_cache, flnode);
-			goto out;
-		}
-	}
-	preempt_disable();
-	kp = &__get_cpu_var(zcache_preloads);
-	if (kp->obj == NULL)
-		kp->obj = obj;
-	else
-		kmem_cache_free(zcache_obj_cache, obj);
-	if (kp->flnode == NULL)
-		kp->flnode = flnode;
-	else
-		kmem_cache_free(ramster_flnode_cache, flnode);
-	if (is_ephemeral(pool)) {
-		if (kp->page == NULL)
-			kp->page = page;
-		else
-			free_page((unsigned long)page);
-	}
-	ret = 0;
-out:
-	return ret;
-}
-
-static int ramster_do_preload_flnode_only(struct tmem_pool *pool)
-{
-	struct zcache_preload *kp;
-	struct flushlist_node *flnode;
-	int ret = -ENOMEM;
-
-	BUG_ON(!irqs_disabled());
-	if (unlikely(ramster_flnode_cache == NULL))
-		BUG();
-	kp = &__get_cpu_var(zcache_preloads);
-	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
-	if (unlikely(flnode == NULL) && kp->flnode == NULL)
-		BUG();  /* FIXME handle more gracefully, but how??? */
-	else if (kp->flnode == NULL)
-		kp->flnode = flnode;
-	else
-		kmem_cache_free(ramster_flnode_cache, flnode);
-	return ret;
-}
-
-static void *zcache_get_free_page(void)
-{
-	struct zcache_preload *kp;
-	void *page;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	page = kp->page;
-	BUG_ON(page == NULL);
-	kp->page = NULL;
-	return page;
-}
-
-static void zcache_free_page(void *p)
-{
-	free_page((unsigned long)p);
-}
-
-/*
- * zcache implementation for tmem host ops
- */
-
-static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
-{
-	struct tmem_objnode *objnode = NULL;
-	unsigned long count;
-	struct zcache_preload *kp;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	if (kp->nr <= 0)
-		goto out;
-	objnode = kp->objnodes[kp->nr - 1];
-	BUG_ON(objnode == NULL);
-	kp->objnodes[kp->nr - 1] = NULL;
-	kp->nr--;
-	count = atomic_inc_return(&zcache_curr_objnode_count);
-	if (count > zcache_curr_objnode_count_max)
-		zcache_curr_objnode_count_max = count;
-out:
-	return objnode;
-}
-
-static void zcache_objnode_free(struct tmem_objnode *objnode,
-					struct tmem_pool *pool)
-{
-	atomic_dec(&zcache_curr_objnode_count);
-	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
-	kmem_cache_free(zcache_objnode_cache, objnode);
-}
-
-static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
-{
-	struct tmem_obj *obj = NULL;
-	unsigned long count;
-	struct zcache_preload *kp;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	obj = kp->obj;
-	BUG_ON(obj == NULL);
-	kp->obj = NULL;
-	count = atomic_inc_return(&zcache_curr_obj_count);
-	if (count > zcache_curr_obj_count_max)
-		zcache_curr_obj_count_max = count;
-	return obj;
-}
-
-static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
-{
-	atomic_dec(&zcache_curr_obj_count);
-	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
-	kmem_cache_free(zcache_obj_cache, obj);
-}
-
-static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
-{
-	struct flushlist_node *flnode = NULL;
-	struct zcache_preload *kp;
-	int count;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	flnode = kp->flnode;
-	BUG_ON(flnode == NULL);
-	kp->flnode = NULL;
-	count = atomic_inc_return(&ramster_curr_flnode_count);
-	if (count > ramster_curr_flnode_count_max)
-		ramster_curr_flnode_count_max = count;
-	return flnode;
-}
-
-static void ramster_flnode_free(struct flushlist_node *flnode,
-				struct tmem_pool *pool)
-{
-	atomic_dec(&ramster_curr_flnode_count);
-	BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0);
-	kmem_cache_free(ramster_flnode_cache, flnode);
-}
-
-static struct tmem_hostops zcache_hostops = {
-	.obj_alloc = zcache_obj_alloc,
-	.obj_free = zcache_obj_free,
-	.objnode_alloc = zcache_objnode_alloc,
-	.objnode_free = zcache_objnode_free,
-};
-
-/*
- * zcache implementations for PAM page descriptor ops
- */
-
-
-static inline void dec_and_check(atomic_t *pvar)
-{
-	atomic_dec(pvar);
-	/* later when all accounting is fixed, make this a BUG */
-	WARN_ON_ONCE(atomic_read(pvar) < 0);
-}
-
-static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_eph_pampd_count_max;
-static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_pers_pampd_count_max;
-
-/* forward reference */
-static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
-
-static int zcache_pampd_eph_create(char *data, size_t size, bool raw,
-				struct tmem_pool *pool, struct tmem_oid *oid,
-				uint32_t index, void **pampd)
-{
-	int ret = -1;
-	void *cdata = data;
-	size_t clen = size;
-	struct zcache_client *cli = pool->client;
-	uint16_t client_id = get_client_id_from_client(cli);
-	struct page *page = NULL;
-	unsigned long count;
-
-	if (!raw) {
-		page = virt_to_page(data);
-		ret = zcache_compress(page, &cdata, &clen);
-		if (ret == 0)
-			goto out;
-		if (clen == 0 || clen > zbud_max_buddy_size()) {
-			zcache_compress_poor++;
-			goto out;
-		}
-	}
-	*pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
-					index, page, cdata, clen);
-	if (*pampd == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ret = 0;
-	count = atomic_inc_return(&zcache_curr_eph_pampd_count);
-	if (count > zcache_curr_eph_pampd_count_max)
-		zcache_curr_eph_pampd_count_max = count;
-	if (client_id != LOCAL_CLIENT) {
-		count = atomic_inc_return(&ramster_foreign_eph_pampd_count);
-		if (count > ramster_foreign_eph_pampd_count_max)
-			ramster_foreign_eph_pampd_count_max = count;
-	}
-out:
-	return ret;
-}
-
-static int zcache_pampd_pers_create(char *data, size_t size, bool raw,
-				struct tmem_pool *pool, struct tmem_oid *oid,
-				uint32_t index, void **pampd)
-{
-	int ret = -1;
-	void *cdata = data;
-	size_t clen = size;
-	struct zcache_client *cli = pool->client;
-	struct page *page;
-	unsigned long count;
-	unsigned long zv_mean_zsize;
-	struct zv_hdr *zv;
-	long curr_pers_pampd_count;
-	u64 total_zsize;
-#ifdef RAMSTER_TESTING
-	static bool pampd_neg_warned;
-#endif
-
-	curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) -
-			atomic_read(&ramster_remote_pers_pages);
-#ifdef RAMSTER_TESTING
-	/* should always be positive, but warn if accounting is off */
-	if (!pampd_neg_warned) {
-		pr_warn("ramster: bad accounting for curr_pers_pampd_count\n");
-		pampd_neg_warned = true;
-	}
-#endif
-	if (curr_pers_pampd_count >
-		    (zv_page_count_policy_percent * totalram_pages) / 100) {
-		zcache_policy_percent_exceeded++;
-		goto out;
-	}
-	if (raw)
-		goto ok_to_create;
-	page = virt_to_page(data);
-	if (zcache_compress(page, &cdata, &clen) == 0)
-		goto out;
-	/* reject if compression is too poor */
-	if (clen > zv_max_zsize) {
-		zcache_compress_poor++;
-		goto out;
-	}
-	/* reject if mean compression is too poor */
-	if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
-		total_zsize = xv_get_total_size_bytes(cli->xvpool);
-		zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count);
-		if (zv_mean_zsize > zv_max_mean_zsize) {
-			zcache_mean_compress_poor++;
-			goto out;
-		}
-	}
-ok_to_create:
-	*pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen);
-	if (*pampd == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ret = 0;
-	count = atomic_inc_return(&zcache_curr_pers_pampd_count);
-	if (count > zcache_curr_pers_pampd_count_max)
-		zcache_curr_pers_pampd_count_max = count;
-	if (is_local_client(cli))
-		goto out;
-	zv = *(struct zv_hdr **)pampd;
-	count = atomic_inc_return(&ramster_foreign_pers_pampd_count);
-	if (count > ramster_foreign_pers_pampd_count_max)
-		ramster_foreign_pers_pampd_count_max = count;
-out:
-	return ret;
-}
-
-static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
-				struct tmem_pool *pool, struct tmem_oid *oid,
-				uint32_t index)
-{
-	void *pampd = NULL;
-	int ret;
-	bool ephemeral;
-
-	BUG_ON(preemptible());
-	ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool));
-	if (ephemeral)
-		ret = zcache_pampd_eph_create(data, size, raw, pool,
-						oid, index, &pampd);
-	else
-		ret = zcache_pampd_pers_create(data, size, raw, pool,
-						oid, index, &pampd);
-	/* FIXME add some counters here for failed creates? */
-	return pampd;
-}
-
-/*
- * fill the pageframe corresponding to the struct page with the data
- * from the passed pampd
- */
-static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
-					void *pampd, struct tmem_pool *pool,
-					struct tmem_oid *oid, uint32_t index)
-{
-	int ret = 0;
-
-	BUG_ON(preemptible());
-	BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */
-	BUG_ON(pampd_is_remote(pampd));
-	if (raw)
-		zv_copy_from_pampd(data, bufsize, pampd);
-	else
-		zv_decompress(virt_to_page(data), pampd);
-	return ret;
-}
-
-static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
-					void *pampd, struct tmem_pool *pool,
-					struct tmem_oid *oid, uint32_t index)
-{
-	int ret = 0;
-	unsigned long flags;
-	struct zcache_client *cli = pool->client;
-
-	BUG_ON(preemptible());
-	BUG_ON(pampd_is_remote(pampd));
-	if (is_ephemeral(pool)) {
-		local_irq_save(flags);
-		if (raw)
-			zbud_copy_from_pampd(data, bufsize, pampd);
-		else
-			ret = zbud_decompress(virt_to_page(data), pampd);
-		zbud_free_and_delist((struct zbud_hdr *)pampd);
-		local_irq_restore(flags);
-		if (!is_local_client(cli))
-			dec_and_check(&ramster_foreign_eph_pampd_count);
-		dec_and_check(&zcache_curr_eph_pampd_count);
-	} else {
-		if (is_local_client(cli))
-			BUG();
-		if (raw)
-			zv_copy_from_pampd(data, bufsize, pampd);
-		else
-			zv_decompress(virt_to_page(data), pampd);
-		zv_free(cli->xvpool, pampd);
-		if (!is_local_client(cli))
-			dec_and_check(&ramster_foreign_pers_pampd_count);
-		dec_and_check(&zcache_curr_pers_pampd_count);
-		ret = 0;
-	}
-	return ret;
-}
-
-static bool zcache_pampd_is_remote(void *pampd)
-{
-	return pampd_is_remote(pampd);
-}
-
-/*
- * free the pampd and remove it from any zcache lists
- * pampd must no longer be pointed to from any tmem data structures!
- */
-static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
-			      struct tmem_oid *oid, uint32_t index, bool acct)
-{
-	struct zcache_client *cli = pool->client;
-	bool eph = is_ephemeral(pool);
-	struct zv_hdr *zv;
-
-	BUG_ON(preemptible());
-	if (pampd_is_remote(pampd)) {
-		WARN_ON(acct == false);
-		if (oid == NULL) {
-			/*
-			 * a NULL oid means to ignore this pampd free
-			 * as the remote freeing will be handled elsewhere
-			 */
-		} else if (eph) {
-			/* FIXME remote flush optional but probably good idea */
-			/* FIXME get these working properly again */
-			dec_and_check(&zcache_curr_eph_pampd_count);
-		} else if (pampd_is_intransit(pampd)) {
-			/* did a pers remote get_and_free, so just free local */
-			pampd = pampd_mask_intransit_and_remote(pampd);
-			goto local_pers;
-		} else {
-			struct flushlist_node *flnode =
-				ramster_flnode_alloc(pool);
-
-			flnode->xh.client_id = pampd_remote_node(pampd);
-			flnode->xh.pool_id = pool->pool_id;
-			flnode->xh.oid = *oid;
-			flnode->xh.index = index;
-			flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
-			spin_lock(&zcache_rem_op_list_lock);
-			list_add(&flnode->rem_op.list, &zcache_rem_op_list);
-			spin_unlock(&zcache_rem_op_list_lock);
-			dec_and_check(&zcache_curr_pers_pampd_count);
-			dec_and_check(&ramster_remote_pers_pages);
-		}
-	} else if (eph) {
-		zbud_free_and_delist((struct zbud_hdr *)pampd);
-		if (!is_local_client(pool->client))
-			dec_and_check(&ramster_foreign_eph_pampd_count);
-		if (acct)
-			/* FIXME get these working properly again */
-			dec_and_check(&zcache_curr_eph_pampd_count);
-	} else {
-local_pers:
-		zv = (struct zv_hdr *)pampd;
-		if (!is_local_client(pool->client))
-			dec_and_check(&ramster_foreign_pers_pampd_count);
-		zv_free(cli->xvpool, zv);
-		if (acct)
-			/* FIXME get these working properly again */
-			dec_and_check(&zcache_curr_pers_pampd_count);
-	}
-}
-
-static void zcache_pampd_free_obj(struct tmem_pool *pool,
-					struct tmem_obj *obj)
-{
-	struct flushlist_node *flnode;
-
-	BUG_ON(preemptible());
-	if (obj->extra == NULL)
-		return;
-	BUG_ON(!pampd_is_remote(obj->extra));
-	flnode = ramster_flnode_alloc(pool);
-	flnode->xh.client_id = pampd_remote_node(obj->extra);
-	flnode->xh.pool_id = pool->pool_id;
-	flnode->xh.oid = obj->oid;
-	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
-	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
-	spin_lock(&zcache_rem_op_list_lock);
-	list_add(&flnode->rem_op.list, &zcache_rem_op_list);
-	spin_unlock(&zcache_rem_op_list_lock);
-}
-
-void zcache_pampd_new_obj(struct tmem_obj *obj)
-{
-	obj->extra = NULL;
-}
-
-int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
-{
-	int ret = -1;
-
-	if (new_pampd != NULL) {
-		if (obj->extra == NULL)
-			obj->extra = new_pampd;
-		/* enforce that all remote pages in an object reside
-		 * in the same node! */
-		else if (pampd_remote_node(new_pampd) !=
-				pampd_remote_node((void *)(obj->extra)))
-			BUG();
-		ret = 0;
-	}
-	return ret;
-}
-
-/*
- * Called by the message handler after a (still compressed) page has been
- * fetched from the remote machine in response to an "is_remote" tmem_get
- * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
- * the page that is to be filled to succesfully resolve the tmem_get; for
- * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
- * in the local zcache).  "data" points to "size" bytes of (compressed) data
- * passed in the message.  In the case of a persistent remote get, if
- * pre-allocation was successful (see zcache_repatriate_preload), the page
- * is placed into both local zcache and at "extra".
- */
-int zcache_localify(int pool_id, struct tmem_oid *oidp,
-			uint32_t index, char *data, size_t size,
-			void *extra)
-{
-	int ret = -ENOENT;
-	unsigned long flags;
-	struct tmem_pool *pool;
-	bool ephemeral, delete = false;
-	size_t clen = PAGE_SIZE;
-	void *pampd, *saved_hb;
-	struct tmem_obj *obj;
-
-	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
-	if (unlikely(pool == NULL))
-		/* pool doesn't exist anymore */
-		goto out;
-	ephemeral = is_ephemeral(pool);
-	local_irq_save(flags);  /* FIXME: maybe only disable softirqs? */
-	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
-	if (pampd == NULL) {
-		/* hmmm... must have been a flush while waiting */
-#ifdef RAMSTER_TESTING
-		pr_err("UNTESTED pampd==NULL in zcache_localify\n");
-#endif
-		if (ephemeral)
-			ramster_remote_eph_pages_unsucc_get++;
-		else
-			ramster_remote_pers_pages_unsucc_get++;
-		obj = NULL;
-		goto finish;
-	} else if (unlikely(!pampd_is_remote(pampd))) {
-		/* hmmm... must have been a dup put while waiting */
-#ifdef RAMSTER_TESTING
-		pr_err("UNTESTED dup while waiting in zcache_localify\n");
-#endif
-		if (ephemeral)
-			ramster_remote_eph_pages_unsucc_get++;
-		else
-			ramster_remote_pers_pages_unsucc_get++;
-		obj = NULL;
-		pampd = NULL;
-		ret = -EEXIST;
-		goto finish;
-	} else if (size == 0) {
-		/* no remote data, delete the local is_remote pampd */
-		pampd = NULL;
-		if (ephemeral)
-			ramster_remote_eph_pages_unsucc_get++;
-		else
-			BUG();
-		delete = true;
-		goto finish;
-	}
-	if (!ephemeral && pampd_is_intransit(pampd)) {
-		/* localify to zcache */
-		pampd = pampd_mask_intransit_and_remote(pampd);
-		zv_copy_to_pampd(pampd, data, size);
-	} else {
-		pampd = NULL;
-		obj = NULL;
-	}
-	if (extra != NULL) {
-		/* decompress direct-to-memory to complete remotify */
-		ret = lzo1x_decompress_safe((char *)data, size,
-						(char *)extra, &clen);
-		BUG_ON(ret != LZO_E_OK);
-		BUG_ON(clen != PAGE_SIZE);
-	}
-	if (ephemeral)
-		ramster_remote_eph_pages_succ_get++;
-	else
-		ramster_remote_pers_pages_succ_get++;
-	ret = 0;
-finish:
-	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
-	zcache_put_pool(pool);
-	local_irq_restore(flags);
-out:
-	return ret;
-}
-
-/*
- * Called on a remote persistent tmem_get to attempt to preallocate
- * local storage for the data contained in the remote persistent page.
- * If succesfully preallocated, returns the pampd, marked as remote and
- * in_transit.  Else returns NULL.  Note that the appropriate tmem data
- * structure must be locked.
- */
-static void *zcache_pampd_repatriate_preload(void *pampd,
-						struct tmem_pool *pool,
-						struct tmem_oid *oid,
-						uint32_t index,
-						bool *intransit)
-{
-	int clen = pampd_remote_size(pampd);
-	void *ret_pampd = NULL;
-	unsigned long flags;
-
-	if (!pampd_is_remote(pampd))
-		BUG();
-	if (is_ephemeral(pool))
-		BUG();
-	if (pampd_is_intransit(pampd)) {
-		/*
-		 * to avoid multiple allocations (and maybe a memory leak)
-		 * don't preallocate if already in the process of being
-		 * repatriated
-		 */
-		*intransit = true;
-		goto out;
-	}
-	*intransit = false;
-	local_irq_save(flags);
-	ret_pampd = (void *)zv_alloc(pool, oid, index, clen);
-	if (ret_pampd != NULL) {
-		/*
-		 *  a pampd is marked intransit if it is remote and space has
-		 *  been allocated for it locally (note, only happens for
-		 *  persistent pages, in which case the remote copy is freed)
-		 */
-		ret_pampd = pampd_mark_intransit(ret_pampd);
-		dec_and_check(&ramster_remote_pers_pages);
-	} else
-		ramster_pers_pages_remote_nomem++;
-	local_irq_restore(flags);
-out:
-	return ret_pampd;
-}
-
-/*
- * Called on a remote tmem_get to invoke a message to fetch the page.
- * Might sleep so no tmem locks can be held.  "extra" is passed
- * all the way through the round-trip messaging to zcache_localify.
- */
-static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd,
-				   struct tmem_pool *pool,
-				   struct tmem_oid *oid, uint32_t index,
-				   bool free, void *extra)
-{
-	struct tmem_xhandle xh;
-	int ret;
-
-	if (pampd_is_intransit(real_pampd))
-		/* have local space pre-reserved, so free remote copy */
-		free = true;
-	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
-	/* unreliable request/response for now */
-	ret = ramster_remote_async_get(&xh, free,
-					pampd_remote_node(fake_pampd),
-					pampd_remote_size(fake_pampd),
-					pampd_remote_cksum(fake_pampd),
-					extra);
-#ifdef RAMSTER_TESTING
-	if (ret != 0 && ret != -ENOENT)
-		pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n",
-			ret);
-#endif
-	return ret;
-}
-
-static struct tmem_pamops zcache_pamops = {
-	.create = zcache_pampd_create,
-	.get_data = zcache_pampd_get_data,
-	.free = zcache_pampd_free,
-	.get_data_and_free = zcache_pampd_get_data_and_free,
-	.free_obj = zcache_pampd_free_obj,
-	.is_remote = zcache_pampd_is_remote,
-	.repatriate_preload = zcache_pampd_repatriate_preload,
-	.repatriate = zcache_pampd_repatriate,
-	.new_obj = zcache_pampd_new_obj,
-	.replace_in_obj = zcache_pampd_replace_in_obj,
-};
-
-/*
- * zcache compression/decompression and related per-cpu stuff
- */
-
-#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
-#define LZO_DSTMEM_PAGE_ORDER 1
-static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
-static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
-
-static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
-{
-	int ret = 0;
-	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
-	unsigned char *wmem = __get_cpu_var(zcache_workmem);
-	char *from_va;
-
-	BUG_ON(!irqs_disabled());
-	if (unlikely(dmem == NULL || wmem == NULL))
-		goto out;  /* no buffer, so can't compress */
-	from_va = kmap_atomic(from);
-	mb();
-	ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
-	BUG_ON(ret != LZO_E_OK);
-	*out_va = dmem;
-	kunmap_atomic(from_va);
-	ret = 1;
-out:
-	return ret;
-}
-
-
-static int zcache_cpu_notifier(struct notifier_block *nb,
-				unsigned long action, void *pcpu)
-{
-	int cpu = (long)pcpu;
-	struct zcache_preload *kp;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
-			GFP_KERNEL | __GFP_REPEAT,
-			LZO_DSTMEM_PAGE_ORDER),
-		per_cpu(zcache_workmem, cpu) =
-			kzalloc(LZO1X_MEM_COMPRESS,
-				GFP_KERNEL | __GFP_REPEAT);
-		per_cpu(zcache_remoteputmem, cpu) =
-			kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
-		break;
-	case CPU_DEAD:
-	case CPU_UP_CANCELED:
-		kfree(per_cpu(zcache_remoteputmem, cpu));
-		per_cpu(zcache_remoteputmem, cpu) = NULL;
-		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
-				LZO_DSTMEM_PAGE_ORDER);
-		per_cpu(zcache_dstmem, cpu) = NULL;
-		kfree(per_cpu(zcache_workmem, cpu));
-		per_cpu(zcache_workmem, cpu) = NULL;
-		kp = &per_cpu(zcache_preloads, cpu);
-		while (kp->nr) {
-			kmem_cache_free(zcache_objnode_cache,
-					kp->objnodes[kp->nr - 1]);
-			kp->objnodes[kp->nr - 1] = NULL;
-			kp->nr--;
-		}
-		if (kp->obj) {
-			kmem_cache_free(zcache_obj_cache, kp->obj);
-			kp->obj = NULL;
-		}
-		if (kp->flnode) {
-			kmem_cache_free(ramster_flnode_cache, kp->flnode);
-			kp->flnode = NULL;
-		}
-		if (kp->page) {
-			free_page((unsigned long)kp->page);
-			kp->page = NULL;
-		}
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block zcache_cpu_notifier_block = {
-	.notifier_call = zcache_cpu_notifier
-};
-
-#ifdef CONFIG_SYSFS
-#define ZCACHE_SYSFS_RO(_name) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-		return sprintf(buf, "%lu\n", zcache_##_name); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-	    return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-	    return _func(buf); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-ZCACHE_SYSFS_RO(curr_obj_count_max);
-ZCACHE_SYSFS_RO(curr_objnode_count_max);
-ZCACHE_SYSFS_RO(flush_total);
-ZCACHE_SYSFS_RO(flush_found);
-ZCACHE_SYSFS_RO(flobj_total);
-ZCACHE_SYSFS_RO(flobj_found);
-ZCACHE_SYSFS_RO(failed_eph_puts);
-ZCACHE_SYSFS_RO(nonactive_puts);
-ZCACHE_SYSFS_RO(failed_pers_puts);
-ZCACHE_SYSFS_RO(zbud_curr_zbytes);
-ZCACHE_SYSFS_RO(zbud_cumul_zpages);
-ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
-ZCACHE_SYSFS_RO(zbud_buddied_count);
-ZCACHE_SYSFS_RO(evicted_raw_pages);
-ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
-ZCACHE_SYSFS_RO(evicted_buddied_pages);
-ZCACHE_SYSFS_RO(failed_get_free_pages);
-ZCACHE_SYSFS_RO(failed_alloc);
-ZCACHE_SYSFS_RO(put_to_flush);
-ZCACHE_SYSFS_RO(compress_poor);
-ZCACHE_SYSFS_RO(mean_compress_poor);
-ZCACHE_SYSFS_RO(policy_percent_exceeded);
-ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
-ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
-ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
-ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
-ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
-			zbud_show_unbuddied_list_counts);
-ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
-			zbud_show_cumul_chunk_counts);
-ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
-			zv_curr_dist_counts_show);
-ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
-			zv_cumul_dist_counts_show);
-
-static struct attribute *zcache_attrs[] = {
-	&zcache_curr_obj_count_attr.attr,
-	&zcache_curr_obj_count_max_attr.attr,
-	&zcache_curr_objnode_count_attr.attr,
-	&zcache_curr_objnode_count_max_attr.attr,
-	&zcache_flush_total_attr.attr,
-	&zcache_flobj_total_attr.attr,
-	&zcache_flush_found_attr.attr,
-	&zcache_flobj_found_attr.attr,
-	&zcache_failed_eph_puts_attr.attr,
-	&zcache_nonactive_puts_attr.attr,
-	&zcache_failed_pers_puts_attr.attr,
-	&zcache_policy_percent_exceeded_attr.attr,
-	&zcache_compress_poor_attr.attr,
-	&zcache_mean_compress_poor_attr.attr,
-	&zcache_zbud_curr_raw_pages_attr.attr,
-	&zcache_zbud_curr_zpages_attr.attr,
-	&zcache_zbud_curr_zbytes_attr.attr,
-	&zcache_zbud_cumul_zpages_attr.attr,
-	&zcache_zbud_cumul_zbytes_attr.attr,
-	&zcache_zbud_buddied_count_attr.attr,
-	&zcache_evicted_raw_pages_attr.attr,
-	&zcache_evicted_unbuddied_pages_attr.attr,
-	&zcache_evicted_buddied_pages_attr.attr,
-	&zcache_failed_get_free_pages_attr.attr,
-	&zcache_failed_alloc_attr.attr,
-	&zcache_put_to_flush_attr.attr,
-	&zcache_zbud_unbuddied_list_counts_attr.attr,
-	&zcache_zbud_cumul_chunk_counts_attr.attr,
-	&zcache_zv_curr_dist_counts_attr.attr,
-	&zcache_zv_cumul_dist_counts_attr.attr,
-	&zcache_zv_max_zsize_attr.attr,
-	&zcache_zv_max_mean_zsize_attr.attr,
-	&zcache_zv_page_count_policy_percent_attr.attr,
-	NULL,
-};
-
-static struct attribute_group zcache_attr_group = {
-	.attrs = zcache_attrs,
-	.name = "zcache",
-};
-
-#define RAMSTER_SYSFS_RO(_name) \
-	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-		return sprintf(buf, "%lu\n", ramster_##_name); \
-	} \
-	static struct kobj_attribute ramster_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = ramster_##_name##_show, \
-	}
-
-#define RAMSTER_SYSFS_RW(_name) \
-	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-		return sprintf(buf, "%lu\n", ramster_##_name); \
-	} \
-	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
-		struct kobj_attribute *attr, const char *buf, size_t count) \
-	{ \
-		int err; \
-		unsigned long enable; \
-		err = kstrtoul(buf, 10, &enable); \
-		if (err) \
-			return -EINVAL; \
-		ramster_##_name = enable; \
-		return count; \
-	} \
-	static struct kobj_attribute ramster_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0644 }, \
-		.show = ramster_##_name##_show, \
-		.store = ramster_##_name##_store, \
-	}
-
-#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
-	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-	    return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
-	} \
-	static struct kobj_attribute ramster_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = ramster_##_name##_show, \
-	}
-
-RAMSTER_SYSFS_RO(interface_revision);
-RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
-RAMSTER_SYSFS_RW(pers_remotify_enable);
-RAMSTER_SYSFS_RW(eph_remotify_enable);
-RAMSTER_SYSFS_RO(eph_pages_remoted);
-RAMSTER_SYSFS_RO(eph_pages_remote_failed);
-RAMSTER_SYSFS_RO(pers_pages_remoted);
-RAMSTER_SYSFS_RO(pers_pages_remote_failed);
-RAMSTER_SYSFS_RO(pers_pages_remote_nomem);
-RAMSTER_SYSFS_RO(remote_pages_flushed);
-RAMSTER_SYSFS_RO(remote_page_flushes_failed);
-RAMSTER_SYSFS_RO(remote_objects_flushed);
-RAMSTER_SYSFS_RO(remote_object_flushes_failed);
-RAMSTER_SYSFS_RO(remote_eph_pages_succ_get);
-RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get);
-RAMSTER_SYSFS_RO(remote_pers_pages_succ_get);
-RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get);
-RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count);
-RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max);
-RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count);
-RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max);
-RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count);
-RAMSTER_SYSFS_RO(curr_flnode_count_max);
-
-#define MANUAL_NODES 8
-static bool ramster_nodes_manual_up[MANUAL_NODES];
-static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buf)
-{
-	int i;
-	char *p = buf;
-	for (i = 0; i < MANUAL_NODES; i++)
-		if (ramster_nodes_manual_up[i])
-			p += sprintf(p, "%d ", i);
-	p += sprintf(p, "\n");
-	return p - buf;
-}
-
-static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
-{
-	int err;
-	unsigned long node_num;
-
-	err = kstrtoul(buf, 10, &node_num);
-	if (err) {
-		pr_err("ramster: bad strtoul?\n");
-		return -EINVAL;
-	}
-	if (node_num >= MANUAL_NODES) {
-		pr_err("ramster: bad node_num=%lu?\n", node_num);
-		return -EINVAL;
-	}
-	if (ramster_nodes_manual_up[node_num]) {
-		pr_err("ramster: node %d already up, ignoring\n",
-							(int)node_num);
-	} else {
-		ramster_nodes_manual_up[node_num] = true;
-		r2net_hb_node_up_manual((int)node_num);
-	}
-	return count;
-}
-
-static struct kobj_attribute ramster_manual_node_up_attr = {
-	.attr = { .name = "manual_node_up", .mode = 0644 },
-	.show = ramster_manual_node_up_show,
-	.store = ramster_manual_node_up_store,
-};
-
-static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
-				struct kobj_attribute *attr, char *buf)
-{
-	if (ramster_remote_target_nodenum == -1UL)
-		return sprintf(buf, "unset\n");
-	else
-		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
-}
-
-static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
-{
-	int err;
-	unsigned long node_num;
-
-	err = kstrtoul(buf, 10, &node_num);
-	if (err) {
-		pr_err("ramster: bad strtoul?\n");
-		return -EINVAL;
-	} else if (node_num == -1UL) {
-		pr_err("ramster: disabling all remotification, "
-			"data may still reside on remote nodes however\n");
-		return -EINVAL;
-	} else if (node_num >= MANUAL_NODES) {
-		pr_err("ramster: bad node_num=%lu?\n", node_num);
-		return -EINVAL;
-	} else if (!ramster_nodes_manual_up[node_num]) {
-		pr_err("ramster: node %d not up, ignoring setting "
-			"of remotification target\n", (int)node_num);
-	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
-		pr_info("ramster: node %d set as remotification target\n",
-				(int)node_num);
-		ramster_remote_target_nodenum = (int)node_num;
-	} else {
-		pr_err("ramster: bad num to node node_num=%d?\n",
-				(int)node_num);
-		return -EINVAL;
-	}
-	return count;
-}
-
-static struct kobj_attribute ramster_remote_target_nodenum_attr = {
-	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
-	.show = ramster_remote_target_nodenum_show,
-	.store = ramster_remote_target_nodenum_store,
-};
-
-
-static struct attribute *ramster_attrs[] = {
-	&ramster_interface_revision_attr.attr,
-	&ramster_pers_remotify_enable_attr.attr,
-	&ramster_eph_remotify_enable_attr.attr,
-	&ramster_remote_pers_pages_attr.attr,
-	&ramster_eph_pages_remoted_attr.attr,
-	&ramster_eph_pages_remote_failed_attr.attr,
-	&ramster_pers_pages_remoted_attr.attr,
-	&ramster_pers_pages_remote_failed_attr.attr,
-	&ramster_pers_pages_remote_nomem_attr.attr,
-	&ramster_remote_pages_flushed_attr.attr,
-	&ramster_remote_page_flushes_failed_attr.attr,
-	&ramster_remote_objects_flushed_attr.attr,
-	&ramster_remote_object_flushes_failed_attr.attr,
-	&ramster_remote_eph_pages_succ_get_attr.attr,
-	&ramster_remote_eph_pages_unsucc_get_attr.attr,
-	&ramster_remote_pers_pages_succ_get_attr.attr,
-	&ramster_remote_pers_pages_unsucc_get_attr.attr,
-	&ramster_foreign_eph_pampd_count_attr.attr,
-	&ramster_foreign_eph_pampd_count_max_attr.attr,
-	&ramster_foreign_pers_pampd_count_attr.attr,
-	&ramster_foreign_pers_pampd_count_max_attr.attr,
-	&ramster_curr_flnode_count_attr.attr,
-	&ramster_curr_flnode_count_max_attr.attr,
-	&ramster_manual_node_up_attr.attr,
-	&ramster_remote_target_nodenum_attr.attr,
-	NULL,
-};
-
-static struct attribute_group ramster_attr_group = {
-	.attrs = ramster_attrs,
-	.name = "ramster",
-};
-
-#endif /* CONFIG_SYSFS */
-/*
- * When zcache is disabled ("frozen"), pools can be created and destroyed,
- * but all puts (and thus all other operations that require memory allocation)
- * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
- * data consistency requires all puts while frozen to be converted into
- * flushes.
- */
-static bool zcache_freeze;
-
-/*
- * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
- */
-static int shrink_zcache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int ret = -1;
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr >= 0) {
-		if (!(gfp_mask & __GFP_FS))
-			/* does this case really need to be skipped? */
-			goto out;
-		zbud_evict_pages(nr);
-	}
-	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
-out:
-	return ret;
-}
-
-static struct shrinker zcache_shrinker = {
-	.shrink = shrink_zcache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
-/*
- * zcache shims between cleancache/frontswap ops and tmem
- */
-
-int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp,
-			uint32_t index, char *data, size_t size,
-			bool raw, int ephemeral)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-
-	BUG_ON(!irqs_disabled());
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	if (unlikely(pool == NULL))
-		goto out;
-	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
-		/* preload does preempt_disable on success */
-		ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral);
-		if (ret < 0) {
-			if (is_ephemeral(pool))
-				zcache_failed_eph_puts++;
-			else
-				zcache_failed_pers_puts++;
-		}
-		zcache_put_pool(pool);
-		preempt_enable_no_resched();
-	} else {
-		zcache_put_to_flush++;
-		if (atomic_read(&pool->obj_count) > 0)
-			/* the put fails whether the flush succeeds or not */
-			(void)tmem_flush_page(pool, oidp, index);
-		zcache_put_pool(pool);
-	}
-out:
-	return ret;
-}
-
-int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp,
-			uint32_t index, char *data, size_t *sizep,
-			bool raw, int get_and_free)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	bool eph;
-
-	if (!raw) {
-		BUG_ON(irqs_disabled());
-		BUG_ON(in_softirq());
-	}
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	eph = is_ephemeral(pool);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_get(pool, oidp, index, data, sizep,
-					raw, get_and_free);
-		zcache_put_pool(pool);
-	}
-	WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, "
-			  "bad things are very likely to happen soon\n");
-#ifdef RAMSTER_TESTING
-	if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))
-		pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
-#endif
-	if (ret == -EAGAIN)
-		BUG(); /* FIXME... don't need this anymore??? let's ensure */
-	return ret;
-}
-
-int zcache_flush(int cli_id, int pool_id,
-				struct tmem_oid *oidp, uint32_t index)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	zcache_flush_total++;
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	ramster_do_preload_flnode_only(pool);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_flush_page(pool, oidp, index);
-		zcache_put_pool(pool);
-	}
-	if (ret >= 0)
-		zcache_flush_found++;
-	local_irq_restore(flags);
-	return ret;
-}
-
-int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	zcache_flobj_total++;
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	ramster_do_preload_flnode_only(pool);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_flush_object(pool, oidp);
-		zcache_put_pool(pool);
-	}
-	if (ret >= 0)
-		zcache_flobj_found++;
-	local_irq_restore(flags);
-	return ret;
-}
-
-int zcache_client_destroy_pool(int cli_id, int pool_id)
-{
-	struct tmem_pool *pool = NULL;
-	struct zcache_client *cli = NULL;
-	int ret = -1;
-
-	if (pool_id < 0)
-		goto out;
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	atomic_inc(&cli->refcount);
-	pool = cli->tmem_pools[pool_id];
-	if (pool == NULL)
-		goto out;
-	cli->tmem_pools[pool_id] = NULL;
-	/* wait for pool activity on other cpus to quiesce */
-	while (atomic_read(&pool->refcount) != 0)
-		;
-	atomic_dec(&cli->refcount);
-	local_bh_disable();
-	ret = tmem_destroy_pool(pool);
-	local_bh_enable();
-	kfree(pool);
-	pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id);
-out:
-	return ret;
-}
-
-static int zcache_destroy_pool(int pool_id)
-{
-	return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
-}
-
-int zcache_new_pool(uint16_t cli_id, uint32_t flags)
-{
-	int poolid = -1;
-	struct tmem_pool *pool;
-	struct zcache_client *cli = NULL;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	atomic_inc(&cli->refcount);
-	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
-	if (pool == NULL) {
-		pr_info("ramster: pool creation failed: out of memory\n");
-		goto out;
-	}
-
-	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
-		if (cli->tmem_pools[poolid] == NULL)
-			break;
-	if (poolid >= MAX_POOLS_PER_CLIENT) {
-		pr_info("ramster: pool creation failed: max exceeded\n");
-		kfree(pool);
-		poolid = -1;
-		goto out;
-	}
-	atomic_set(&pool->refcount, 0);
-	pool->client = cli;
-	pool->pool_id = poolid;
-	tmem_new_pool(pool, flags);
-	cli->tmem_pools[poolid] = pool;
-	if (cli_id == LOCAL_CLIENT)
-		pr_info("ramster: created %s tmem pool, id=%d, local client\n",
-			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-			poolid);
-	else
-		pr_info("ramster: created %s tmem pool, id=%d, client=%d\n",
-			flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-			poolid, cli_id);
-out:
-	if (cli != NULL)
-		atomic_dec(&cli->refcount);
-	return poolid;
-}
-
-static int zcache_local_new_pool(uint32_t flags)
-{
-	return zcache_new_pool(LOCAL_CLIENT, flags);
-}
-
-int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral)
-{
-	struct tmem_pool *pool;
-	struct zcache_client *cli = NULL;
-	uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST;
-	int ret = -1;
-
-	if (cli_id == LOCAL_CLIENT)
-		goto out;
-	if (pool_id >= MAX_POOLS_PER_CLIENT)
-		goto out;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap))
-		BUG(); /* FIXME, handle more gracefully later */
-	if (!cli->allocated) {
-		if (zcache_new_client(cli_id))
-			BUG(); /* FIXME, handle more gracefully later */
-		cli = &zcache_clients[cli_id];
-	}
-	atomic_inc(&cli->refcount);
-	pool = cli->tmem_pools[pool_id];
-	if (pool != NULL) {
-		if (pool->persistent && ephemeral) {
-			pr_err("zcache_autocreate_pool: type mismatch\n");
-			goto out;
-		}
-		ret = 0;
-		goto out;
-	}
-	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
-	if (pool == NULL) {
-		pr_info("ramster: pool creation failed: out of memory\n");
-		goto out;
-	}
-	atomic_set(&pool->refcount, 0);
-	pool->client = cli;
-	pool->pool_id = pool_id;
-	tmem_new_pool(pool, flags);
-	cli->tmem_pools[pool_id] = pool;
-	pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n",
-		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-		pool_id, cli_id);
-	ret = 0;
-out:
-	if (cli == NULL)
-		BUG(); /* FIXME, handle more gracefully later */
-		/* pr_err("zcache_autocreate_pool: failed\n"); */
-	if (cli != NULL)
-		atomic_dec(&cli->refcount);
-	return ret;
-}
-
-/**********
- * Two kernel functionalities currently can be layered on top of tmem.
- * These are "cleancache" which is used as a second-chance cache for clean
- * page cache pages; and "frontswap" which is used for swap pages
- * to avoid writes to disk.  A generic "shim" is provided here for each
- * to translate in-kernel semantics to zcache semantics.
- */
-
-#ifdef CONFIG_CLEANCACHE
-static void zcache_cleancache_put_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-#ifdef __PG_WAS_ACTIVE
-	if (!PageWasActive(page)) {
-		zcache_nonactive_puts++;
-		return;
-	}
-#endif
-	if (likely(ind == index)) {
-		char *kva = page_address(page);
-
-		(void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index,
-			kva, PAGE_SIZE, 0, 1);
-	}
-}
-
-static int zcache_cleancache_get_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-	int ret = -1;
-
-	preempt_disable();
-	if (likely(ind == index)) {
-		char *kva = page_address(page);
-		size_t size = PAGE_SIZE;
-
-		ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index,
-			kva, &size, 0, 0);
-#ifdef __PG_WAS_ACTIVE
-		if (ret == 0)
-			SetPageWasActive(page);
-#endif
-	}
-	preempt_enable();
-	return ret;
-}
-
-static void zcache_cleancache_flush_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (likely(ind == index))
-		(void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind);
-}
-
-static void zcache_cleancache_flush_inode(int pool_id,
-					struct cleancache_filekey key)
-{
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
-}
-
-static void zcache_cleancache_flush_fs(int pool_id)
-{
-	if (pool_id >= 0)
-		(void)zcache_destroy_pool(pool_id);
-}
-
-static int zcache_cleancache_init_fs(size_t pagesize)
-{
-	BUG_ON(sizeof(struct cleancache_filekey) !=
-				sizeof(struct tmem_oid));
-	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_local_new_pool(0);
-}
-
-static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
-{
-	/* shared pools are unsupported and map to private */
-	BUG_ON(sizeof(struct cleancache_filekey) !=
-				sizeof(struct tmem_oid));
-	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_local_new_pool(0);
-}
-
-static struct cleancache_ops zcache_cleancache_ops = {
-	.put_page = zcache_cleancache_put_page,
-	.get_page = zcache_cleancache_get_page,
-	.invalidate_page = zcache_cleancache_flush_page,
-	.invalidate_inode = zcache_cleancache_flush_inode,
-	.invalidate_fs = zcache_cleancache_flush_fs,
-	.init_shared_fs = zcache_cleancache_init_shared_fs,
-	.init_fs = zcache_cleancache_init_fs
-};
-
-struct cleancache_ops zcache_cleancache_register_ops(void)
-{
-	struct cleancache_ops old_ops =
-		cleancache_register_ops(&zcache_cleancache_ops);
-
-	return old_ops;
-}
-#endif
-
-#ifdef CONFIG_FRONTSWAP
-/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-static int zcache_frontswap_poolid = -1;
-
-/*
- * Swizzling increases objects per swaptype, increasing tmem concurrency
- * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
- */
-#define SWIZ_BITS		8
-#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
-#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
-#define iswiz(_ind)		(_ind >> SWIZ_BITS)
-
-static inline struct tmem_oid oswiz(unsigned type, u32 ind)
-{
-	struct tmem_oid oid = { .oid = { 0 } };
-	oid.oid[0] = _oswiz(type, ind);
-	return oid;
-}
-
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-	int ret = -1;
-	unsigned long flags;
-	char *kva;
-
-	BUG_ON(!PageLocked(page));
-	if (likely(ind64 == ind)) {
-		local_irq_save(flags);
-		kva = page_address(page);
-		ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid,
-				&oid, iswiz(ind), kva, PAGE_SIZE, 0, 0);
-		local_irq_restore(flags);
-	}
-	return ret;
-}
-
-/* returns 0 if the page was successfully gotten from frontswap, -1 if
- * was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-	int ret = -1;
-
-	preempt_disable(); /* FIXME, remove this? */
-	BUG_ON(!PageLocked(page));
-	if (likely(ind64 == ind)) {
-		char *kva = page_address(page);
-		size_t size = PAGE_SIZE;
-
-		ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid,
-					&oid, iswiz(ind), kva, &size, 0, -1);
-	}
-	preempt_enable(); /* FIXME, remove this? */
-	return ret;
-}
-
-/* flush a single page from frontswap */
-static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-
-	if (likely(ind64 == ind))
-		(void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid,
-					&oid, iswiz(ind));
-}
-
-/* flush all pages from the passed swaptype */
-static void zcache_frontswap_flush_area(unsigned type)
-{
-	struct tmem_oid oid;
-	int ind;
-
-	for (ind = SWIZ_MASK; ind >= 0; ind--) {
-		oid = oswiz(type, ind);
-		(void)zcache_flush_object(LOCAL_CLIENT,
-						zcache_frontswap_poolid, &oid);
-	}
-}
-
-static void zcache_frontswap_init(unsigned ignored)
-{
-	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-	if (zcache_frontswap_poolid < 0)
-		zcache_frontswap_poolid =
-				zcache_local_new_pool(TMEM_POOL_PERSIST);
-}
-
-static struct frontswap_ops zcache_frontswap_ops = {
-	.put_page = zcache_frontswap_put_page,
-	.get_page = zcache_frontswap_get_page,
-	.invalidate_page = zcache_frontswap_flush_page,
-	.invalidate_area = zcache_frontswap_flush_area,
-	.init = zcache_frontswap_init
-};
-
-struct frontswap_ops zcache_frontswap_register_ops(void)
-{
-	struct frontswap_ops old_ops =
-		frontswap_register_ops(&zcache_frontswap_ops);
-
-	return old_ops;
-}
-#endif
-
-/*
- * frontswap selfshrinking
- */
-
-#ifdef CONFIG_FRONTSWAP
-/* In HZ, controls frequency of worker invocation. */
-static unsigned int selfshrink_interval __read_mostly = 5;
-
-static void selfshrink_process(struct work_struct *work);
-static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);
-
-/* Enable/disable with sysfs. */
-static bool frontswap_selfshrinking __read_mostly;
-
-/* Enable/disable with kernel boot option. */
-static bool use_frontswap_selfshrink __initdata = true;
-
-/*
- * The default values for the following parameters were deemed reasonable
- * by experimentation, may be workload-dependent, and can all be
- * adjusted via sysfs.
- */
-
-/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
-static unsigned int frontswap_hysteresis __read_mostly = 20;
-
-/*
- * Number of selfshrink worker invocations to wait before observing that
- * frontswap selfshrinking should commence. Note that selfshrinking does
- * not use a separate worker thread.
- */
-static unsigned int frontswap_inertia __read_mostly = 3;
-
-/* Countdown to next invocation of frontswap_shrink() */
-static unsigned long frontswap_inertia_counter;
-
-/*
- * Invoked by the selfshrink worker thread, uses current number of pages
- * in frontswap (frontswap_curr_pages()), previous status, and control
- * values (hysteresis and inertia) to determine if frontswap should be
- * shrunk and what the new frontswap size should be.  Note that
- * frontswap_shrink is essentially a partial swapoff that immediately
- * transfers pages from the "swap device" (frontswap) back into kernel
- * RAM; despite the name, frontswap "shrinking" is very different from
- * the "shrinker" interface used by the kernel MM subsystem to reclaim
- * memory.
- */
-static void frontswap_selfshrink(void)
-{
-	static unsigned long cur_frontswap_pages;
-	static unsigned long last_frontswap_pages;
-	static unsigned long tgt_frontswap_pages;
-
-	last_frontswap_pages = cur_frontswap_pages;
-	cur_frontswap_pages = frontswap_curr_pages();
-	if (!cur_frontswap_pages ||
-			(cur_frontswap_pages > last_frontswap_pages)) {
-		frontswap_inertia_counter = frontswap_inertia;
-		return;
-	}
-	if (frontswap_inertia_counter && --frontswap_inertia_counter)
-		return;
-	if (cur_frontswap_pages <= frontswap_hysteresis)
-		tgt_frontswap_pages = 0;
-	else
-		tgt_frontswap_pages = cur_frontswap_pages -
-			(cur_frontswap_pages / frontswap_hysteresis);
-	frontswap_shrink(tgt_frontswap_pages);
-}
-
-static int __init ramster_nofrontswap_selfshrink_setup(char *s)
-{
-	use_frontswap_selfshrink = false;
-	return 1;
-}
-
-__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);
-
-static void selfshrink_process(struct work_struct *work)
-{
-	if (frontswap_selfshrinking && frontswap_enabled) {
-		frontswap_selfshrink();
-		schedule_delayed_work(&selfshrink_worker,
-			selfshrink_interval * HZ);
-	}
-}
-
-static int ramster_enabled;
-
-static int __init ramster_selfshrink_init(void)
-{
-	frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink;
-	if (frontswap_selfshrinking)
-		pr_info("ramster: Initializing frontswap "
-					"selfshrinking driver.\n");
-	else
-		return -ENODEV;
-
-	schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ);
-
-	return 0;
-}
-
-subsys_initcall(ramster_selfshrink_init);
-#endif
-
-/*
- * zcache initialization
- * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
- * NOTHING HAPPENS!
- */
-
-static int ramster_enabled;
-
-static int __init enable_ramster(char *s)
-{
-	ramster_enabled = 1;
-	return 1;
-}
-__setup("ramster", enable_ramster);
-
-/* allow independent dynamic disabling of cleancache and frontswap */
-
-static int use_cleancache = 1;
-
-static int __init no_cleancache(char *s)
-{
-	pr_info("INIT no_cleancache called\n");
-	use_cleancache = 0;
-	return 1;
-}
-
-/*
- * FIXME: need to guarantee this gets checked before zcache_init is called
- * What is the correct way to achieve this?
- */
-early_param("nocleancache", no_cleancache);
-
-static int use_frontswap = 1;
-
-static int __init no_frontswap(char *s)
-{
-	pr_info("INIT no_frontswap called\n");
-	use_frontswap = 0;
-	return 1;
-}
-
-__setup("nofrontswap", no_frontswap);
-
-static int __init zcache_init(void)
-{
-	int ret = 0;
-
-#ifdef CONFIG_SYSFS
-	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
-	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
-	if (ret) {
-		pr_err("ramster: can't create sysfs\n");
-		goto out;
-	}
-#endif /* CONFIG_SYSFS */
-#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
-	if (ramster_enabled) {
-		unsigned int cpu;
-
-		(void)r2net_register_handlers();
-		tmem_register_hostops(&zcache_hostops);
-		tmem_register_pamops(&zcache_pamops);
-		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
-		if (ret) {
-			pr_err("ramster: can't register cpu notifier\n");
-			goto out;
-		}
-		for_each_online_cpu(cpu) {
-			void *pcpu = (void *)(long)cpu;
-			zcache_cpu_notifier(&zcache_cpu_notifier_block,
-				CPU_UP_PREPARE, pcpu);
-		}
-	}
-	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
-				sizeof(struct tmem_objnode), 0, 0, NULL);
-	zcache_obj_cache = kmem_cache_create("zcache_obj",
-				sizeof(struct tmem_obj), 0, 0, NULL);
-	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
-				sizeof(struct flushlist_node), 0, 0, NULL);
-#endif
-#ifdef CONFIG_CLEANCACHE
-	pr_info("INIT ramster_enabled=%d use_cleancache=%d\n",
-					ramster_enabled, use_cleancache);
-	if (ramster_enabled && use_cleancache) {
-		struct cleancache_ops old_ops;
-
-		zbud_init();
-		register_shrinker(&zcache_shrinker);
-		old_ops = zcache_cleancache_register_ops();
-		pr_info("ramster: cleancache enabled using kernel "
-			"transcendent memory and compression buddies\n");
-		if (old_ops.init_fs != NULL)
-			pr_warning("ramster: cleancache_ops overridden");
-	}
-#endif
-#ifdef CONFIG_FRONTSWAP
-	pr_info("INIT ramster_enabled=%d use_frontswap=%d\n",
-					ramster_enabled, use_frontswap);
-	if (ramster_enabled && use_frontswap) {
-		struct frontswap_ops old_ops;
-
-		zcache_new_client(LOCAL_CLIENT);
-		old_ops = zcache_frontswap_register_ops();
-		pr_info("ramster: frontswap enabled using kernel "
-			"transcendent memory and xvmalloc\n");
-		if (old_ops.init != NULL)
-			pr_warning("ramster: frontswap_ops overridden");
-	}
-	if (ramster_enabled && (use_frontswap || use_cleancache))
-		ramster_remotify_init();
-#endif
-out:
-	return ret;
-}
-
-module_init(zcache_init)
diff --git a/drivers/staging/ramster/zcache.h b/drivers/staging/ramster/zcache.h
deleted file mode 100644
index 250b121c2..000000000
--- a/drivers/staging/ramster/zcache.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * zcache.h
- *
- * External zcache functions
- *
- * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
- */
-
-#ifndef _ZCACHE_H_
-#define _ZCACHE_H_
-
-extern int zcache_put(int, int, struct tmem_oid *, uint32_t,
-			char *, size_t, bool, int);
-extern int zcache_autocreate_pool(int, int, bool);
-extern int zcache_get(int, int, struct tmem_oid *, uint32_t,
-			char *, size_t *, bool, int);
-extern int zcache_flush(int, int, struct tmem_oid *, uint32_t);
-extern int zcache_flush_object(int, int, struct tmem_oid *);
-extern int zcache_localify(int, struct tmem_oid *, uint32_t,
-			char *, size_t, void *);
-
-#endif /* _ZCACHE_H */
diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig
deleted file mode 100644
index 7048e01f0..000000000
--- a/drivers/staging/zcache/Kconfig
+++ /dev/null
@@ -1,14 +0,0 @@
-config ZCACHE
-	bool "Dynamic compression of swap pages and clean pagecache pages"
-	# X86 dependency is because zsmalloc uses non-portable pte/tlb
-	# functions
-	depends on (CLEANCACHE || FRONTSWAP) && CRYPTO=y && X86
-	select ZSMALLOC
-	select CRYPTO_LZO
-	default n
-	help
-	  Zcache doubles RAM efficiency while providing a significant
-	  performance boosts on many workloads.  Zcache uses
-	  compression and an in-kernel implementation of transcendent
-	  memory to store clean page cache pages and swap in RAM,
-	  providing a noticeable reduction in disk I/O.
diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile
deleted file mode 100644
index 60daa272c..000000000
--- a/drivers/staging/zcache/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-zcache-y	:=	zcache-main.o tmem.o
-
-obj-$(CONFIG_ZCACHE)	+=	zcache.o
diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c
deleted file mode 100644
index 1ca66ea9b..000000000
--- a/drivers/staging/zcache/tmem.c
+++ /dev/null
@@ -1,770 +0,0 @@
-/*
- * In-kernel transcendent memory (generic implementation)
- *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- *
- * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented
- * "handles" (triples containing a pool id, and object id, and an index), to
- * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
- * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
- * set of functions (pamops).  Each pampd contains some representation of
- * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of
- * pages and must be able to insert, find, and delete these pages at a
- * potential frequency of thousands per second concurrently across many CPUs,
- * (and, if used with KVM, across many vcpus across many guests).
- * Tmem is tracked with a hierarchy of data structures, organized by
- * the elements in a handle-tuple: pool_id, object_id, and page index.
- * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
- * Each pool, contains a hash table of rb_trees of tmem_objs.  Each
- * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
- * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
- * a pampd, which is accessible only through a small set of callbacks
- * registered by the PAM implementation (see tmem_register_pamops). Tmem
- * does all memory allocation via a set of callbacks registered by the tmem
- * host implementation (e.g. see tmem_register_hostops).
- */
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/atomic.h>
-
-#include "tmem.h"
-
-/* data structure sentinels used for debugging... see tmem.h */
-#define POOL_SENTINEL 0x87658765
-#define OBJ_SENTINEL 0x12345678
-#define OBJNODE_SENTINEL 0xfedcba09
-
-/*
- * A tmem host implementation must use this function to register callbacks
- * for memory allocation.
- */
-static struct tmem_hostops tmem_hostops;
-
-static void tmem_objnode_tree_init(void);
-
-void tmem_register_hostops(struct tmem_hostops *m)
-{
-	tmem_objnode_tree_init();
-	tmem_hostops = *m;
-}
-
-/*
- * A tmem host implementation must use this function to register
- * callbacks for a page-accessible memory (PAM) implementation
- */
-static struct tmem_pamops tmem_pamops;
-
-void tmem_register_pamops(struct tmem_pamops *m)
-{
-	tmem_pamops = *m;
-}
-
-/*
- * Oid's are potentially very sparse and tmem_objs may have an indeterminately
- * short life, being added and deleted at a relatively high frequency.
- * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
- * of the potentially huge number of tmem_objs, each pool manages a hashtable
- * of rb_trees to reduce search, insert, delete, and rebalancing time.
- * Each hashbucket also has a lock to manage concurrent access.
- *
- * The following routines manage tmem_objs.  When any tmem_obj is accessed,
- * the hashbucket lock must be held.
- */
-
-/* searches for object==oid in pool, returns locked object if found */
-static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
-					struct tmem_oid *oidp)
-{
-	struct rb_node *rbnode;
-	struct tmem_obj *obj;
-
-	rbnode = hb->obj_rb_root.rb_node;
-	while (rbnode) {
-		BUG_ON(RB_EMPTY_NODE(rbnode));
-		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
-		switch (tmem_oid_compare(oidp, &obj->oid)) {
-		case 0: /* equal */
-			goto out;
-		case -1:
-			rbnode = rbnode->rb_left;
-			break;
-		case 1:
-			rbnode = rbnode->rb_right;
-			break;
-		}
-	}
-	obj = NULL;
-out:
-	return obj;
-}
-
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);
-
-/* free an object that has no more pampds in it */
-static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
-{
-	struct tmem_pool *pool;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pampd_count > 0);
-	pool = obj->pool;
-	BUG_ON(pool == NULL);
-	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
-		tmem_pampd_destroy_all_in_obj(obj);
-	BUG_ON(obj->objnode_tree_root != NULL);
-	BUG_ON((long)obj->objnode_count != 0);
-	atomic_dec(&pool->obj_count);
-	BUG_ON(atomic_read(&pool->obj_count) < 0);
-	INVERT_SENTINEL(obj, OBJ);
-	obj->pool = NULL;
-	tmem_oid_set_invalid(&obj->oid);
-	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
-}
-
-/*
- * initialize, and insert an tmem_object_root (called only if find failed)
- */
-static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
-					struct tmem_pool *pool,
-					struct tmem_oid *oidp)
-{
-	struct rb_root *root = &hb->obj_rb_root;
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
-	struct tmem_obj *this;
-
-	BUG_ON(pool == NULL);
-	atomic_inc(&pool->obj_count);
-	obj->objnode_tree_height = 0;
-	obj->objnode_tree_root = NULL;
-	obj->pool = pool;
-	obj->oid = *oidp;
-	obj->objnode_count = 0;
-	obj->pampd_count = 0;
-	(*tmem_pamops.new_obj)(obj);
-	SET_SENTINEL(obj, OBJ);
-	while (*new) {
-		BUG_ON(RB_EMPTY_NODE(*new));
-		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
-		parent = *new;
-		switch (tmem_oid_compare(oidp, &this->oid)) {
-		case 0:
-			BUG(); /* already present; should never happen! */
-			break;
-		case -1:
-			new = &(*new)->rb_left;
-			break;
-		case 1:
-			new = &(*new)->rb_right;
-			break;
-		}
-	}
-	rb_link_node(&obj->rb_tree_node, parent, new);
-	rb_insert_color(&obj->rb_tree_node, root);
-}
-
-/*
- * Tmem is managed as a set of tmem_pools with certain attributes, such as
- * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
- * and all pampds that belong to a tmem_pool.  A tmem_pool is created
- * or deleted relatively rarely (for example, when a filesystem is
- * mounted or unmounted.
- */
-
-/* flush all data from a pool and, optionally, free it */
-static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
-{
-	struct rb_node *rbnode;
-	struct tmem_obj *obj;
-	struct tmem_hashbucket *hb = &pool->hashbucket[0];
-	int i;
-
-	BUG_ON(pool == NULL);
-	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
-		spin_lock(&hb->lock);
-		rbnode = rb_first(&hb->obj_rb_root);
-		while (rbnode != NULL) {
-			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
-			rbnode = rb_next(rbnode);
-			tmem_pampd_destroy_all_in_obj(obj);
-			tmem_obj_free(obj, hb);
-			(*tmem_hostops.obj_free)(obj, pool);
-		}
-		spin_unlock(&hb->lock);
-	}
-	if (destroy)
-		list_del(&pool->pool_list);
-}
-
-/*
- * A tmem_obj contains a radix-tree-like tree in which the intermediate
- * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
- * is very specialized and tuned for specific uses and is not particularly
- * suited for use from this code, though some code from the core algorithms has
- * been reused, thus the copyright notices below).  Each tmem_objnode contains
- * a set of pointers which point to either a set of intermediate tmem_objnodes
- * or a set of of pampds.
- *
- * Portions Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- */
-
-struct tmem_objnode_tree_path {
-	struct tmem_objnode *objnode;
-	int offset;
-};
-
-/* objnode height_to_maxindex translation */
-static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
-
-static void tmem_objnode_tree_init(void)
-{
-	unsigned int ht, tmp;
-
-	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
-		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
-		if (tmp >= OBJNODE_TREE_INDEX_BITS)
-			tmem_objnode_tree_h2max[ht] = ~0UL;
-		else
-			tmem_objnode_tree_h2max[ht] =
-			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
-	}
-}
-
-static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
-{
-	struct tmem_objnode *objnode;
-
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
-	if (unlikely(objnode == NULL))
-		goto out;
-	objnode->obj = obj;
-	SET_SENTINEL(objnode, OBJNODE);
-	memset(&objnode->slots, 0, sizeof(objnode->slots));
-	objnode->slots_in_use = 0;
-	obj->objnode_count++;
-out:
-	return objnode;
-}
-
-static void tmem_objnode_free(struct tmem_objnode *objnode)
-{
-	struct tmem_pool *pool;
-	int i;
-
-	BUG_ON(objnode == NULL);
-	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
-		BUG_ON(objnode->slots[i] != NULL);
-	ASSERT_SENTINEL(objnode, OBJNODE);
-	INVERT_SENTINEL(objnode, OBJNODE);
-	BUG_ON(objnode->obj == NULL);
-	ASSERT_SENTINEL(objnode->obj, OBJ);
-	pool = objnode->obj->pool;
-	BUG_ON(pool == NULL);
-	ASSERT_SENTINEL(pool, POOL);
-	objnode->obj->objnode_count--;
-	objnode->obj = NULL;
-	(*tmem_hostops.objnode_free)(objnode, pool);
-}
-
-/*
- * lookup index in object and return associated pampd (or NULL if not found)
- */
-static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
-{
-	unsigned int height, shift;
-	struct tmem_objnode **slot = NULL;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-
-	height = obj->objnode_tree_height;
-	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
-		goto out;
-	if (height == 0 && obj->objnode_tree_root) {
-		slot = &obj->objnode_tree_root;
-		goto out;
-	}
-	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
-	slot = &obj->objnode_tree_root;
-	while (height > 0) {
-		if (*slot == NULL)
-			goto out;
-		slot = (struct tmem_objnode **)
-			((*slot)->slots +
-			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	}
-out:
-	return slot != NULL ? (void **)slot : NULL;
-}
-
-static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
-{
-	struct tmem_objnode **slot;
-
-	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
-	return slot != NULL ? *slot : NULL;
-}
-
-static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
-					void *new_pampd)
-{
-	struct tmem_objnode **slot;
-	void *ret = NULL;
-
-	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
-	if ((slot != NULL) && (*slot != NULL)) {
-		void *old_pampd = *(void **)slot;
-		*(void **)slot = new_pampd;
-		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
-		ret = new_pampd;
-	}
-	return ret;
-}
-
-static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
-					void *pampd)
-{
-	int ret = 0;
-	struct tmem_objnode *objnode = NULL, *newnode, *slot;
-	unsigned int height, shift;
-	int offset = 0;
-
-	/* if necessary, extend the tree to be higher  */
-	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
-		height = obj->objnode_tree_height + 1;
-		if (index > tmem_objnode_tree_h2max[height])
-			while (index > tmem_objnode_tree_h2max[height])
-				height++;
-		if (obj->objnode_tree_root == NULL) {
-			obj->objnode_tree_height = height;
-			goto insert;
-		}
-		do {
-			newnode = tmem_objnode_alloc(obj);
-			if (!newnode) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			newnode->slots[0] = obj->objnode_tree_root;
-			newnode->slots_in_use = 1;
-			obj->objnode_tree_root = newnode;
-			obj->objnode_tree_height++;
-		} while (height > obj->objnode_tree_height);
-	}
-insert:
-	slot = obj->objnode_tree_root;
-	height = obj->objnode_tree_height;
-	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
-	while (height > 0) {
-		if (slot == NULL) {
-			/* add a child objnode.  */
-			slot = tmem_objnode_alloc(obj);
-			if (!slot) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			if (objnode) {
-
-				objnode->slots[offset] = slot;
-				objnode->slots_in_use++;
-			} else
-				obj->objnode_tree_root = slot;
-		}
-		/* go down a level */
-		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
-		objnode = slot;
-		slot = objnode->slots[offset];
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	}
-	BUG_ON(slot != NULL);
-	if (objnode) {
-		objnode->slots_in_use++;
-		objnode->slots[offset] = pampd;
-	} else
-		obj->objnode_tree_root = pampd;
-	obj->pampd_count++;
-out:
-	return ret;
-}
-
-static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
-{
-	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
-	struct tmem_objnode_tree_path *pathp = path;
-	struct tmem_objnode *slot = NULL;
-	unsigned int height, shift;
-	int offset;
-
-	BUG_ON(obj == NULL);
-	ASSERT_SENTINEL(obj, OBJ);
-	BUG_ON(obj->pool == NULL);
-	ASSERT_SENTINEL(obj->pool, POOL);
-	height = obj->objnode_tree_height;
-	if (index > tmem_objnode_tree_h2max[height])
-		goto out;
-	slot = obj->objnode_tree_root;
-	if (height == 0 && obj->objnode_tree_root) {
-		obj->objnode_tree_root = NULL;
-		goto out;
-	}
-	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
-	pathp->objnode = NULL;
-	do {
-		if (slot == NULL)
-			goto out;
-		pathp++;
-		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->objnode = slot;
-		slot = slot->slots[offset];
-		shift -= OBJNODE_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
-	if (slot == NULL)
-		goto out;
-	while (pathp->objnode) {
-		pathp->objnode->slots[pathp->offset] = NULL;
-		pathp->objnode->slots_in_use--;
-		if (pathp->objnode->slots_in_use) {
-			if (pathp->objnode == obj->objnode_tree_root) {
-				while (obj->objnode_tree_height > 0 &&
-				  obj->objnode_tree_root->slots_in_use == 1 &&
-				  obj->objnode_tree_root->slots[0]) {
-					struct tmem_objnode *to_free =
-						obj->objnode_tree_root;
-
-					obj->objnode_tree_root =
-							to_free->slots[0];
-					obj->objnode_tree_height--;
-					to_free->slots[0] = NULL;
-					to_free->slots_in_use = 0;
-					tmem_objnode_free(to_free);
-				}
-			}
-			goto out;
-		}
-		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
-		pathp--;
-	}
-	obj->objnode_tree_height = 0;
-	obj->objnode_tree_root = NULL;
-
-out:
-	if (slot != NULL)
-		obj->pampd_count--;
-	BUG_ON(obj->pampd_count < 0);
-	return slot;
-}
-
-/* recursively walk the objnode_tree destroying pampds and objnodes */
-static void tmem_objnode_node_destroy(struct tmem_obj *obj,
-					struct tmem_objnode *objnode,
-					unsigned int ht)
-{
-	int i;
-
-	if (ht == 0)
-		return;
-	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
-		if (objnode->slots[i]) {
-			if (ht == 1) {
-				obj->pampd_count--;
-				(*tmem_pamops.free)(objnode->slots[i],
-						obj->pool, NULL, 0);
-				objnode->slots[i] = NULL;
-				continue;
-			}
-			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
-			tmem_objnode_free(objnode->slots[i]);
-			objnode->slots[i] = NULL;
-		}
-	}
-}
-
-static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
-{
-	if (obj->objnode_tree_root == NULL)
-		return;
-	if (obj->objnode_tree_height == 0) {
-		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
-	} else {
-		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
-					obj->objnode_tree_height);
-		tmem_objnode_free(obj->objnode_tree_root);
-		obj->objnode_tree_height = 0;
-	}
-	obj->objnode_tree_root = NULL;
-	(*tmem_pamops.free_obj)(obj->pool, obj);
-}
-
-/*
- * Tmem is operated on by a set of well-defined actions:
- * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
- * (The tmem ABI allows for subpages and exchanges but these operations
- * are not included in this implementation.)
- *
- * These "tmem core" operations are implemented in the following functions.
- */
-
-/*
- * "Put" a page, e.g. copy a page from the kernel into newly allocated
- * PAM space (if such space is available).  Tmem_put is complicated by
- * a corner case: What if a page with matching handle already exists in
- * tmem?  To guarantee coherency, one of two actions is necessary: Either
- * the data for the page must be overwritten, or the page must be
- * "flushed" so that the data is not accessible to a subsequent "get".
- * Since these "duplicate puts" are relatively rare, this implementation
- * always flushes for simplicity.
- */
-int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t size, bool raw, bool ephemeral)
-{
-	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
-	void *pampd = NULL, *pampd_del = NULL;
-	int ret = -ENOMEM;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = objfound = tmem_obj_find(hb, oidp);
-	if (obj != NULL) {
-		pampd = tmem_pampd_lookup_in_obj(objfound, index);
-		if (pampd != NULL) {
-			/* if found, is a dup put, flush the old one */
-			pampd_del = tmem_pampd_delete_from_obj(obj, index);
-			BUG_ON(pampd_del != pampd);
-			(*tmem_pamops.free)(pampd, pool, oidp, index);
-			if (obj->pampd_count == 0) {
-				objnew = obj;
-				objfound = NULL;
-			}
-			pampd = NULL;
-		}
-	} else {
-		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
-		if (unlikely(obj == NULL)) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		tmem_obj_init(obj, hb, pool, oidp);
-	}
-	BUG_ON(obj == NULL);
-	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
-	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
-					obj->pool, &obj->oid, index);
-	if (unlikely(pampd == NULL))
-		goto free;
-	ret = tmem_pampd_add_to_obj(obj, index, pampd);
-	if (unlikely(ret == -ENOMEM))
-		/* may have partially built objnode tree ("stump") */
-		goto delete_and_free;
-	goto out;
-
-delete_and_free:
-	(void)tmem_pampd_delete_from_obj(obj, index);
-free:
-	if (pampd)
-		(*tmem_pamops.free)(pampd, pool, NULL, 0);
-	if (objnew) {
-		tmem_obj_free(objnew, hb);
-		(*tmem_hostops.obj_free)(objnew, pool);
-	}
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * "Get" a page, e.g. if one can be found, copy the tmem page with the
- * matching handle from PAM space to the kernel.  By tmem definition,
- * when a "get" is successful on an ephemeral page, the page is "flushed",
- * and when a "get" is successful on a persistent page, the page is retained
- * in tmem.  Note that to preserve
- * coherency, "get" can never be skipped if tmem contains the data.
- * That is, if a get is done with a certain handle and fails, any
- * subsequent "get" must also fail (unless of course there is a
- * "put" done with the same handle).
-
- */
-int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t *size, bool raw, int get_and_free)
-{
-	struct tmem_obj *obj;
-	void *pampd;
-	bool ephemeral = is_ephemeral(pool);
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
-	bool lock_held = false;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	lock_held = true;
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	if (free)
-		pampd = tmem_pampd_delete_from_obj(obj, index);
-	else
-		pampd = tmem_pampd_lookup_in_obj(obj, index);
-	if (pampd == NULL)
-		goto out;
-	if (free) {
-		if (obj->pampd_count == 0) {
-			tmem_obj_free(obj, hb);
-			(*tmem_hostops.obj_free)(obj, pool);
-			obj = NULL;
-		}
-	}
-	if (tmem_pamops.is_remote(pampd)) {
-		lock_held = false;
-		spin_unlock(&hb->lock);
-	}
-	if (free)
-		ret = (*tmem_pamops.get_data_and_free)(
-				data, size, raw, pampd, pool, oidp, index);
-	else
-		ret = (*tmem_pamops.get_data)(
-				data, size, raw, pampd, pool, oidp, index);
-	if (ret < 0)
-		goto out;
-	ret = 0;
-out:
-	if (lock_held)
-		spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * If a page in tmem matches the handle, "flush" this page from tmem such
- * that any subsequent "get" does not succeed (unless, of course, there
- * was another "put" with the same handle).
- */
-int tmem_flush_page(struct tmem_pool *pool,
-				struct tmem_oid *oidp, uint32_t index)
-{
-	struct tmem_obj *obj;
-	void *pampd;
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	pampd = tmem_pampd_delete_from_obj(obj, index);
-	if (pampd == NULL)
-		goto out;
-	(*tmem_pamops.free)(pampd, pool, oidp, index);
-	if (obj->pampd_count == 0) {
-		tmem_obj_free(obj, hb);
-		(*tmem_hostops.obj_free)(obj, pool);
-	}
-	ret = 0;
-
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page.  Returns 0 if
- * there was a page to replace, else returns -1.
- */
-int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
-			uint32_t index, void *new_pampd)
-{
-	struct tmem_obj *obj;
-	int ret = -1;
-	struct tmem_hashbucket *hb;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
-	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * "Flush" all pages in tmem matching this oid.
- */
-int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
-{
-	struct tmem_obj *obj;
-	struct tmem_hashbucket *hb;
-	int ret = -1;
-
-	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
-	spin_lock(&hb->lock);
-	obj = tmem_obj_find(hb, oidp);
-	if (obj == NULL)
-		goto out;
-	tmem_pampd_destroy_all_in_obj(obj);
-	tmem_obj_free(obj, hb);
-	(*tmem_hostops.obj_free)(obj, pool);
-	ret = 0;
-
-out:
-	spin_unlock(&hb->lock);
-	return ret;
-}
-
-/*
- * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
- * all subsequent access to this tmem_pool.
- */
-int tmem_destroy_pool(struct tmem_pool *pool)
-{
-	int ret = -1;
-
-	if (pool == NULL)
-		goto out;
-	tmem_pool_flush(pool, 1);
-	ret = 0;
-out:
-	return ret;
-}
-
-static LIST_HEAD(tmem_global_pool_list);
-
-/*
- * Create a new tmem_pool with the provided flag and return
- * a pool id provided by the tmem host implementation.
- */
-void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
-{
-	int persistent = flags & TMEM_POOL_PERSIST;
-	int shared = flags & TMEM_POOL_SHARED;
-	struct tmem_hashbucket *hb = &pool->hashbucket[0];
-	int i;
-
-	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
-		hb->obj_rb_root = RB_ROOT;
-		spin_lock_init(&hb->lock);
-	}
-	INIT_LIST_HEAD(&pool->pool_list);
-	atomic_set(&pool->obj_count, 0);
-	SET_SENTINEL(pool, POOL);
-	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
-	pool->persistent = persistent;
-	pool->shared = shared;
-}
diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h
deleted file mode 100644
index 0d4aa8270..000000000
--- a/drivers/staging/zcache/tmem.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * tmem.h
- *
- * Transcendent memory
- *
- * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
- */
-
-#ifndef _TMEM_H_
-#define _TMEM_H_
-
-#include <linux/types.h>
-#include <linux/highmem.h>
-#include <linux/hash.h>
-#include <linux/atomic.h>
-
-/*
- * These are pre-defined by the Xen<->Linux ABI
- */
-#define TMEM_PUT_PAGE			4
-#define TMEM_GET_PAGE			5
-#define TMEM_FLUSH_PAGE			6
-#define TMEM_FLUSH_OBJECT		7
-#define TMEM_POOL_PERSIST		1
-#define TMEM_POOL_SHARED		2
-#define TMEM_POOL_PRECOMPRESSED		4
-#define TMEM_POOL_PAGESIZE_SHIFT	4
-#define TMEM_POOL_PAGESIZE_MASK		0xf
-#define TMEM_POOL_RESERVED_BITS		0x00ffff00
-
-/*
- * sentinels have proven very useful for debugging but can be removed
- * or disabled before final merge.
- */
-#define SENTINELS
-#ifdef SENTINELS
-#define DECL_SENTINEL uint32_t sentinel;
-#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL)
-#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL)
-#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL)
-#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL)
-#else
-#define DECL_SENTINEL
-#define SET_SENTINEL(_x, _y) do { } while (0)
-#define INVERT_SENTINEL(_x, _y) do { } while (0)
-#define ASSERT_SENTINEL(_x, _y) do { } while (0)
-#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0)
-#endif
-
-#define ASSERT_SPINLOCK(_l)	lockdep_assert_held(_l)
-
-/*
- * A pool is the highest-level data structure managed by tmem and
- * usually corresponds to a large independent set of pages such as
- * a filesystem.  Each pool has an id, and certain attributes and counters.
- * It also contains a set of hash buckets, each of which contains an rbtree
- * of objects and a lock to manage concurrency within the pool.
- */
-
-#define TMEM_HASH_BUCKET_BITS	8
-#define TMEM_HASH_BUCKETS	(1<<TMEM_HASH_BUCKET_BITS)
-
-struct tmem_hashbucket {
-	struct rb_root obj_rb_root;
-	spinlock_t lock;
-};
-
-struct tmem_pool {
-	void *client; /* "up" for some clients, avoids table lookup */
-	struct list_head pool_list;
-	uint32_t pool_id;
-	bool persistent;
-	bool shared;
-	atomic_t obj_count;
-	atomic_t refcount;
-	struct tmem_hashbucket hashbucket[TMEM_HASH_BUCKETS];
-	DECL_SENTINEL
-};
-
-#define is_persistent(_p)  (_p->persistent)
-#define is_ephemeral(_p)   (!(_p->persistent))
-
-/*
- * An object id ("oid") is large: 192-bits (to ensure, for example, files
- * in a modern filesystem can be uniquely identified).
- */
-
-struct tmem_oid {
-	uint64_t oid[3];
-};
-
-static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
-{
-	oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
-}
-
-static inline bool tmem_oid_valid(struct tmem_oid *oidp)
-{
-	return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL ||
-		oidp->oid[2] != -1UL;
-}
-
-static inline int tmem_oid_compare(struct tmem_oid *left,
-					struct tmem_oid *right)
-{
-	int ret;
-
-	if (left->oid[2] == right->oid[2]) {
-		if (left->oid[1] == right->oid[1]) {
-			if (left->oid[0] == right->oid[0])
-				ret = 0;
-			else if (left->oid[0] < right->oid[0])
-				ret = -1;
-			else
-				return 1;
-		} else if (left->oid[1] < right->oid[1])
-			ret = -1;
-		else
-			ret = 1;
-	} else if (left->oid[2] < right->oid[2])
-		ret = -1;
-	else
-		ret = 1;
-	return ret;
-}
-
-static inline unsigned tmem_oid_hash(struct tmem_oid *oidp)
-{
-	return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
-				TMEM_HASH_BUCKET_BITS);
-}
-
-/*
- * A tmem_obj contains an identifier (oid), pointers to the parent
- * pool and the rb_tree to which it belongs, counters, and an ordered
- * set of pampds, structured in a radix-tree-like tree.  The intermediate
- * nodes of the tree are called tmem_objnodes.
- */
-
-struct tmem_objnode;
-
-struct tmem_obj {
-	struct tmem_oid oid;
-	struct tmem_pool *pool;
-	struct rb_node rb_tree_node;
-	struct tmem_objnode *objnode_tree_root;
-	unsigned int objnode_tree_height;
-	unsigned long objnode_count;
-	long pampd_count;
-	void *extra; /* for private use by pampd implementation */
-	DECL_SENTINEL
-};
-
-#define OBJNODE_TREE_MAP_SHIFT 6
-#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT)
-#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1)
-#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define OBJNODE_TREE_MAX_PATH \
-		(OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2)
-
-struct tmem_objnode {
-	struct tmem_obj *obj;
-	DECL_SENTINEL
-	void *slots[OBJNODE_TREE_MAP_SIZE];
-	unsigned int slots_in_use;
-};
-
-/* pampd abstract datatype methods provided by the PAM implementation */
-struct tmem_pamops {
-	void *(*create)(char *, size_t, bool, int,
-			struct tmem_pool *, struct tmem_oid *, uint32_t);
-	int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *,
-				struct tmem_oid *, uint32_t);
-	int (*get_data_and_free)(char *, size_t *, bool, void *,
-				struct tmem_pool *, struct tmem_oid *,
-				uint32_t);
-	void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
-	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
-	bool (*is_remote)(void *);
-	void (*new_obj)(struct tmem_obj *);
-	int (*replace_in_obj)(void *, struct tmem_obj *);
-};
-extern void tmem_register_pamops(struct tmem_pamops *m);
-
-/* memory allocation methods provided by the host implementation */
-struct tmem_hostops {
-	struct tmem_obj *(*obj_alloc)(struct tmem_pool *);
-	void (*obj_free)(struct tmem_obj *, struct tmem_pool *);
-	struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *);
-	void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *);
-};
-extern void tmem_register_hostops(struct tmem_hostops *m);
-
-/* core tmem accessor functions */
-extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t, bool, bool);
-extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t *, bool, int);
-extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			void *);
-extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
-			uint32_t index);
-extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
-extern int tmem_destroy_pool(struct tmem_pool *);
-extern void tmem_new_pool(struct tmem_pool *, uint32_t);
-#endif /* _TMEM_H */
diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
deleted file mode 100644
index 1812bedab..000000000
--- a/drivers/staging/zcache/zcache-main.c
+++ /dev/null
@@ -1,2083 +0,0 @@
-/*
- * zcache.c
- *
- * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
- * Copyright (c) 2010,2011, Nitin Gupta
- *
- * Zcache provides an in-kernel "host implementation" for transcendent memory
- * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
- * page-accessible memory [1] interfaces, both utilizing the crypto compression
- * API:
- * 1) "compression buddies" ("zbud") is used for ephemeral pages
- * 2) zsmalloc is used for persistent pages.
- * Xvmalloc (based on the TLSF allocator) has very low fragmentation
- * so maximizes space efficiency, while zbud allows pairs (and potentially,
- * in the future, more than a pair of) compressed pages to be closely linked
- * so that reclaiming can be done via the kernel's physical-page-oriented
- * "shrinker" interface.
- *
- * [1] For a definition of page-accessible memory (aka PAM), see:
- *   http://marc.info/?l=linux-mm&m=127811271605009
- */
-
-#include <linux/module.h>
-#include <linux/cpu.h>
-#include <linux/highmem.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/atomic.h>
-#include <linux/math64.h>
-#include <linux/crypto.h>
-#include <linux/string.h>
-#include "tmem.h"
-
-#include "../zsmalloc/zsmalloc.h"
-
-#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
-#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
-#endif
-#ifdef CONFIG_CLEANCACHE
-#include <linux/cleancache.h>
-#endif
-#ifdef CONFIG_FRONTSWAP
-#include <linux/frontswap.h>
-#endif
-
-#if 0
-/* this is more aggressive but may cause other problems? */
-#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
-#else
-#define ZCACHE_GFP_MASK \
-	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
-#endif
-
-#define MAX_POOLS_PER_CLIENT 16
-
-#define MAX_CLIENTS 16
-#define LOCAL_CLIENT ((uint16_t)-1)
-
-MODULE_LICENSE("GPL");
-
-struct zcache_client {
-	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
-	struct zs_pool *zspool;
-	bool allocated;
-	atomic_t refcount;
-};
-
-static struct zcache_client zcache_host;
-static struct zcache_client zcache_clients[MAX_CLIENTS];
-
-static inline uint16_t get_client_id_from_client(struct zcache_client *cli)
-{
-	BUG_ON(cli == NULL);
-	if (cli == &zcache_host)
-		return LOCAL_CLIENT;
-	return cli - &zcache_clients[0];
-}
-
-static inline bool is_local_client(struct zcache_client *cli)
-{
-	return cli == &zcache_host;
-}
-
-/* crypto API for zcache  */
-#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
-static char zcache_comp_name[ZCACHE_COMP_NAME_SZ];
-static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
-
-enum comp_op {
-	ZCACHE_COMPOP_COMPRESS,
-	ZCACHE_COMPOP_DECOMPRESS
-};
-
-static inline int zcache_comp_op(enum comp_op op,
-				const u8 *src, unsigned int slen,
-				u8 *dst, unsigned int *dlen)
-{
-	struct crypto_comp *tfm;
-	int ret;
-
-	BUG_ON(!zcache_comp_pcpu_tfms);
-	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
-	BUG_ON(!tfm);
-	switch (op) {
-	case ZCACHE_COMPOP_COMPRESS:
-		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
-		break;
-	case ZCACHE_COMPOP_DECOMPRESS:
-		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
-		break;
-	}
-	put_cpu();
-	return ret;
-}
-
-/**********
- * Compression buddies ("zbud") provides for packing two (or, possibly
- * in the future, more) compressed ephemeral pages into a single "raw"
- * (physical) page and tracking them with data structures so that
- * the raw pages can be easily reclaimed.
- *
- * A zbud page ("zbpg") is an aligned page containing a list_head,
- * a lock, and two "zbud headers".  The remainder of the physical
- * page is divided up into aligned 64-byte "chunks" which contain
- * the compressed data for zero, one, or two zbuds.  Each zbpg
- * resides on: (1) an "unused list" if it has no zbuds; (2) a
- * "buddied" list if it is fully populated  with two zbuds; or
- * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
- * the one unbuddied zbud uses.  The data inside a zbpg cannot be
- * read or written unless the zbpg's lock is held.
- */
-
-#define ZBH_SENTINEL  0x43214321
-#define ZBPG_SENTINEL  0xdeadbeef
-
-#define ZBUD_MAX_BUDS 2
-
-struct zbud_hdr {
-	uint16_t client_id;
-	uint16_t pool_id;
-	struct tmem_oid oid;
-	uint32_t index;
-	uint16_t size; /* compressed size in bytes, zero means unused */
-	DECL_SENTINEL
-};
-
-struct zbud_page {
-	struct list_head bud_list;
-	spinlock_t lock;
-	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
-	DECL_SENTINEL
-	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
-};
-
-#define CHUNK_SHIFT	6
-#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
-#define CHUNK_MASK	(~(CHUNK_SIZE-1))
-#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
-				CHUNK_MASK) >> CHUNK_SHIFT)
-#define MAX_CHUNK	(NCHUNKS-1)
-
-static struct {
-	struct list_head list;
-	unsigned count;
-} zbud_unbuddied[NCHUNKS];
-/* list N contains pages with N chunks USED and NCHUNKS-N unused */
-/* element 0 is never used but optimizing that isn't worth it */
-static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
-
-struct list_head zbud_buddied_list;
-static unsigned long zcache_zbud_buddied_count;
-
-/* protects the buddied list and all unbuddied lists */
-static DEFINE_SPINLOCK(zbud_budlists_spinlock);
-
-static LIST_HEAD(zbpg_unused_list);
-static unsigned long zcache_zbpg_unused_list_count;
-
-/* protects the unused page list */
-static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
-
-static atomic_t zcache_zbud_curr_raw_pages;
-static atomic_t zcache_zbud_curr_zpages;
-static unsigned long zcache_zbud_curr_zbytes;
-static unsigned long zcache_zbud_cumul_zpages;
-static unsigned long zcache_zbud_cumul_zbytes;
-static unsigned long zcache_compress_poor;
-static unsigned long zcache_mean_compress_poor;
-
-/* forward references */
-static void *zcache_get_free_page(void);
-static void zcache_free_page(void *p);
-
-/*
- * zbud helper functions
- */
-
-static inline unsigned zbud_max_buddy_size(void)
-{
-	return MAX_CHUNK << CHUNK_SHIFT;
-}
-
-static inline unsigned zbud_size_to_chunks(unsigned size)
-{
-	BUG_ON(size == 0 || size > zbud_max_buddy_size());
-	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
-}
-
-static inline int zbud_budnum(struct zbud_hdr *zh)
-{
-	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
-	struct zbud_page *zbpg = NULL;
-	unsigned budnum = -1U;
-	int i;
-
-	for (i = 0; i < ZBUD_MAX_BUDS; i++)
-		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
-			budnum = i;
-			break;
-		}
-	BUG_ON(budnum == -1U);
-	return budnum;
-}
-
-static char *zbud_data(struct zbud_hdr *zh, unsigned size)
-{
-	struct zbud_page *zbpg;
-	char *p;
-	unsigned budnum;
-
-	ASSERT_SENTINEL(zh, ZBH);
-	budnum = zbud_budnum(zh);
-	BUG_ON(size == 0 || size > zbud_max_buddy_size());
-	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
-	ASSERT_SPINLOCK(&zbpg->lock);
-	p = (char *)zbpg;
-	if (budnum == 0)
-		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
-							CHUNK_MASK);
-	else if (budnum == 1)
-		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
-	return p;
-}
-
-/*
- * zbud raw page management
- */
-
-static struct zbud_page *zbud_alloc_raw_page(void)
-{
-	struct zbud_page *zbpg = NULL;
-	struct zbud_hdr *zh0, *zh1;
-	bool recycled = 0;
-
-	/* if any pages on the zbpg list, use one */
-	spin_lock(&zbpg_unused_list_spinlock);
-	if (!list_empty(&zbpg_unused_list)) {
-		zbpg = list_first_entry(&zbpg_unused_list,
-				struct zbud_page, bud_list);
-		list_del_init(&zbpg->bud_list);
-		zcache_zbpg_unused_list_count--;
-		recycled = 1;
-	}
-	spin_unlock(&zbpg_unused_list_spinlock);
-	if (zbpg == NULL)
-		/* none on zbpg list, try to get a kernel page */
-		zbpg = zcache_get_free_page();
-	if (likely(zbpg != NULL)) {
-		INIT_LIST_HEAD(&zbpg->bud_list);
-		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
-		spin_lock_init(&zbpg->lock);
-		if (recycled) {
-			ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
-			SET_SENTINEL(zbpg, ZBPG);
-			BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
-			BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
-		} else {
-			atomic_inc(&zcache_zbud_curr_raw_pages);
-			INIT_LIST_HEAD(&zbpg->bud_list);
-			SET_SENTINEL(zbpg, ZBPG);
-			zh0->size = 0; zh1->size = 0;
-			tmem_oid_set_invalid(&zh0->oid);
-			tmem_oid_set_invalid(&zh1->oid);
-		}
-	}
-	return zbpg;
-}
-
-static void zbud_free_raw_page(struct zbud_page *zbpg)
-{
-	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
-
-	ASSERT_SENTINEL(zbpg, ZBPG);
-	BUG_ON(!list_empty(&zbpg->bud_list));
-	ASSERT_SPINLOCK(&zbpg->lock);
-	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
-	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
-	INVERT_SENTINEL(zbpg, ZBPG);
-	spin_unlock(&zbpg->lock);
-	spin_lock(&zbpg_unused_list_spinlock);
-	list_add(&zbpg->bud_list, &zbpg_unused_list);
-	zcache_zbpg_unused_list_count++;
-	spin_unlock(&zbpg_unused_list_spinlock);
-}
-
-/*
- * core zbud handling routines
- */
-
-static unsigned zbud_free(struct zbud_hdr *zh)
-{
-	unsigned size;
-
-	ASSERT_SENTINEL(zh, ZBH);
-	BUG_ON(!tmem_oid_valid(&zh->oid));
-	size = zh->size;
-	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
-	zh->size = 0;
-	tmem_oid_set_invalid(&zh->oid);
-	INVERT_SENTINEL(zh, ZBH);
-	zcache_zbud_curr_zbytes -= size;
-	atomic_dec(&zcache_zbud_curr_zpages);
-	return size;
-}
-
-static void zbud_free_and_delist(struct zbud_hdr *zh)
-{
-	unsigned chunks;
-	struct zbud_hdr *zh_other;
-	unsigned budnum = zbud_budnum(zh), size;
-	struct zbud_page *zbpg =
-		container_of(zh, struct zbud_page, buddy[budnum]);
-
-	spin_lock(&zbud_budlists_spinlock);
-	spin_lock(&zbpg->lock);
-	if (list_empty(&zbpg->bud_list)) {
-		/* ignore zombie page... see zbud_evict_pages() */
-		spin_unlock(&zbpg->lock);
-		spin_unlock(&zbud_budlists_spinlock);
-		return;
-	}
-	size = zbud_free(zh);
-	ASSERT_SPINLOCK(&zbpg->lock);
-	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
-	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
-		chunks = zbud_size_to_chunks(size) ;
-		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
-		list_del_init(&zbpg->bud_list);
-		zbud_unbuddied[chunks].count--;
-		spin_unlock(&zbud_budlists_spinlock);
-		zbud_free_raw_page(zbpg);
-	} else { /* was buddied: move remaining buddy to unbuddied list */
-		chunks = zbud_size_to_chunks(zh_other->size) ;
-		list_del_init(&zbpg->bud_list);
-		zcache_zbud_buddied_count--;
-		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
-		zbud_unbuddied[chunks].count++;
-		spin_unlock(&zbud_budlists_spinlock);
-		spin_unlock(&zbpg->lock);
-	}
-}
-
-static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id,
-					struct tmem_oid *oid,
-					uint32_t index, struct page *page,
-					void *cdata, unsigned size)
-{
-	struct zbud_hdr *zh0, *zh1, *zh = NULL;
-	struct zbud_page *zbpg = NULL, *ztmp;
-	unsigned nchunks;
-	char *to;
-	int i, found_good_buddy = 0;
-
-	nchunks = zbud_size_to_chunks(size) ;
-	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
-		spin_lock(&zbud_budlists_spinlock);
-		if (!list_empty(&zbud_unbuddied[i].list)) {
-			list_for_each_entry_safe(zbpg, ztmp,
-				    &zbud_unbuddied[i].list, bud_list) {
-				if (spin_trylock(&zbpg->lock)) {
-					found_good_buddy = i;
-					goto found_unbuddied;
-				}
-			}
-		}
-		spin_unlock(&zbud_budlists_spinlock);
-	}
-	/* didn't find a good buddy, try allocating a new page */
-	zbpg = zbud_alloc_raw_page();
-	if (unlikely(zbpg == NULL))
-		goto out;
-	/* ok, have a page, now compress the data before taking locks */
-	spin_lock(&zbud_budlists_spinlock);
-	spin_lock(&zbpg->lock);
-	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
-	zbud_unbuddied[nchunks].count++;
-	zh = &zbpg->buddy[0];
-	goto init_zh;
-
-found_unbuddied:
-	ASSERT_SPINLOCK(&zbpg->lock);
-	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
-	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
-	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
-		ASSERT_SENTINEL(zh0, ZBH);
-		zh = zh1;
-	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
-		ASSERT_SENTINEL(zh1, ZBH);
-		zh = zh0;
-	} else
-		BUG();
-	list_del_init(&zbpg->bud_list);
-	zbud_unbuddied[found_good_buddy].count--;
-	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
-	zcache_zbud_buddied_count++;
-
-init_zh:
-	SET_SENTINEL(zh, ZBH);
-	zh->size = size;
-	zh->index = index;
-	zh->oid = *oid;
-	zh->pool_id = pool_id;
-	zh->client_id = client_id;
-	to = zbud_data(zh, size);
-	memcpy(to, cdata, size);
-	spin_unlock(&zbpg->lock);
-	spin_unlock(&zbud_budlists_spinlock);
-
-	zbud_cumul_chunk_counts[nchunks]++;
-	atomic_inc(&zcache_zbud_curr_zpages);
-	zcache_zbud_cumul_zpages++;
-	zcache_zbud_curr_zbytes += size;
-	zcache_zbud_cumul_zbytes += size;
-out:
-	return zh;
-}
-
-static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
-{
-	struct zbud_page *zbpg;
-	unsigned budnum = zbud_budnum(zh);
-	unsigned int out_len = PAGE_SIZE;
-	char *to_va, *from_va;
-	unsigned size;
-	int ret = 0;
-
-	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
-	spin_lock(&zbpg->lock);
-	if (list_empty(&zbpg->bud_list)) {
-		/* ignore zombie page... see zbud_evict_pages() */
-		ret = -EINVAL;
-		goto out;
-	}
-	ASSERT_SENTINEL(zh, ZBH);
-	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
-	to_va = kmap_atomic(page);
-	size = zh->size;
-	from_va = zbud_data(zh, size);
-	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
-				to_va, &out_len);
-	BUG_ON(ret);
-	BUG_ON(out_len != PAGE_SIZE);
-	kunmap_atomic(to_va);
-out:
-	spin_unlock(&zbpg->lock);
-	return ret;
-}
-
-/*
- * The following routines handle shrinking of ephemeral pages by evicting
- * pages "least valuable" first.
- */
-
-static unsigned long zcache_evicted_raw_pages;
-static unsigned long zcache_evicted_buddied_pages;
-static unsigned long zcache_evicted_unbuddied_pages;
-
-static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id,
-						uint16_t poolid);
-static void zcache_put_pool(struct tmem_pool *pool);
-
-/*
- * Flush and free all zbuds in a zbpg, then free the pageframe
- */
-static void zbud_evict_zbpg(struct zbud_page *zbpg)
-{
-	struct zbud_hdr *zh;
-	int i, j;
-	uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS];
-	uint32_t index[ZBUD_MAX_BUDS];
-	struct tmem_oid oid[ZBUD_MAX_BUDS];
-	struct tmem_pool *pool;
-
-	ASSERT_SPINLOCK(&zbpg->lock);
-	BUG_ON(!list_empty(&zbpg->bud_list));
-	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
-		zh = &zbpg->buddy[i];
-		if (zh->size) {
-			client_id[j] = zh->client_id;
-			pool_id[j] = zh->pool_id;
-			oid[j] = zh->oid;
-			index[j] = zh->index;
-			j++;
-			zbud_free(zh);
-		}
-	}
-	spin_unlock(&zbpg->lock);
-	for (i = 0; i < j; i++) {
-		pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
-		if (pool != NULL) {
-			tmem_flush_page(pool, &oid[i], index[i]);
-			zcache_put_pool(pool);
-		}
-	}
-	ASSERT_SENTINEL(zbpg, ZBPG);
-	spin_lock(&zbpg->lock);
-	zbud_free_raw_page(zbpg);
-}
-
-/*
- * Free nr pages.  This code is funky because we want to hold the locks
- * protecting various lists for as short a time as possible, and in some
- * circumstances the list may change asynchronously when the list lock is
- * not held.  In some cases we also trylock not only to avoid waiting on a
- * page in use by another cpu, but also to avoid potential deadlock due to
- * lock inversion.
- */
-static void zbud_evict_pages(int nr)
-{
-	struct zbud_page *zbpg;
-	int i;
-
-	/* first try freeing any pages on unused list */
-retry_unused_list:
-	spin_lock_bh(&zbpg_unused_list_spinlock);
-	if (!list_empty(&zbpg_unused_list)) {
-		/* can't walk list here, since it may change when unlocked */
-		zbpg = list_first_entry(&zbpg_unused_list,
-				struct zbud_page, bud_list);
-		list_del_init(&zbpg->bud_list);
-		zcache_zbpg_unused_list_count--;
-		atomic_dec(&zcache_zbud_curr_raw_pages);
-		spin_unlock_bh(&zbpg_unused_list_spinlock);
-		zcache_free_page(zbpg);
-		zcache_evicted_raw_pages++;
-		if (--nr <= 0)
-			goto out;
-		goto retry_unused_list;
-	}
-	spin_unlock_bh(&zbpg_unused_list_spinlock);
-
-	/* now try freeing unbuddied pages, starting with least space avail */
-	for (i = 0; i < MAX_CHUNK; i++) {
-retry_unbud_list_i:
-		spin_lock_bh(&zbud_budlists_spinlock);
-		if (list_empty(&zbud_unbuddied[i].list)) {
-			spin_unlock_bh(&zbud_budlists_spinlock);
-			continue;
-		}
-		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
-			if (unlikely(!spin_trylock(&zbpg->lock)))
-				continue;
-			list_del_init(&zbpg->bud_list);
-			zbud_unbuddied[i].count--;
-			spin_unlock(&zbud_budlists_spinlock);
-			zcache_evicted_unbuddied_pages++;
-			/* want budlists unlocked when doing zbpg eviction */
-			zbud_evict_zbpg(zbpg);
-			local_bh_enable();
-			if (--nr <= 0)
-				goto out;
-			goto retry_unbud_list_i;
-		}
-		spin_unlock_bh(&zbud_budlists_spinlock);
-	}
-
-	/* as a last resort, free buddied pages */
-retry_bud_list:
-	spin_lock_bh(&zbud_budlists_spinlock);
-	if (list_empty(&zbud_buddied_list)) {
-		spin_unlock_bh(&zbud_budlists_spinlock);
-		goto out;
-	}
-	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
-		if (unlikely(!spin_trylock(&zbpg->lock)))
-			continue;
-		list_del_init(&zbpg->bud_list);
-		zcache_zbud_buddied_count--;
-		spin_unlock(&zbud_budlists_spinlock);
-		zcache_evicted_buddied_pages++;
-		/* want budlists unlocked when doing zbpg eviction */
-		zbud_evict_zbpg(zbpg);
-		local_bh_enable();
-		if (--nr <= 0)
-			goto out;
-		goto retry_bud_list;
-	}
-	spin_unlock_bh(&zbud_budlists_spinlock);
-out:
-	return;
-}
-
-static void zbud_init(void)
-{
-	int i;
-
-	INIT_LIST_HEAD(&zbud_buddied_list);
-	zcache_zbud_buddied_count = 0;
-	for (i = 0; i < NCHUNKS; i++) {
-		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
-		zbud_unbuddied[i].count = 0;
-	}
-}
-
-#ifdef CONFIG_SYSFS
-/*
- * These sysfs routines show a nice distribution of how many zbpg's are
- * currently (and have ever been placed) in each unbuddied list.  It's fun
- * to watch but can probably go away before final merge.
- */
-static int zbud_show_unbuddied_list_counts(char *buf)
-{
-	int i;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++)
-		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
-	return p - buf;
-}
-
-static int zbud_show_cumul_chunk_counts(char *buf)
-{
-	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
-	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
-	unsigned long total_chunks_lte_42 = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
-		chunks += zbud_cumul_chunk_counts[i];
-		total_chunks += zbud_cumul_chunk_counts[i];
-		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
-		if (i == 21)
-			total_chunks_lte_21 = total_chunks;
-		if (i == 32)
-			total_chunks_lte_32 = total_chunks;
-		if (i == 42)
-			total_chunks_lte_42 = total_chunks;
-	}
-	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
-		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-#endif
-
-/**********
- * This "zv" PAM implementation combines the slab-based zsmalloc
- * with the crypto compression API to maximize the amount of data that can
- * be packed into a physical page.
- *
- * Zv represents a PAM page with the index and object (plus a "size" value
- * necessary for decompression) immediately preceding the compressed data.
- */
-
-#define ZVH_SENTINEL  0x43214321
-
-struct zv_hdr {
-	uint32_t pool_id;
-	struct tmem_oid oid;
-	uint32_t index;
-	size_t size;
-	DECL_SENTINEL
-};
-
-/* rudimentary policy limits */
-/* total number of persistent pages may not exceed this percentage */
-static unsigned int zv_page_count_policy_percent = 75;
-/*
- * byte count defining poor compression; pages with greater zsize will be
- * rejected
- */
-static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;
-/*
- * byte count defining poor *mean* compression; pages with greater zsize
- * will be rejected until sufficient better-compressed pages are accepted
- * driving the mean below this threshold
- */
-static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;
-
-static atomic_t zv_curr_dist_counts[NCHUNKS];
-static atomic_t zv_cumul_dist_counts[NCHUNKS];
-
-static struct zv_hdr *zv_create(struct zs_pool *pool, uint32_t pool_id,
-				struct tmem_oid *oid, uint32_t index,
-				void *cdata, unsigned clen)
-{
-	struct zv_hdr *zv;
-	u32 size = clen + sizeof(struct zv_hdr);
-	int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
-	void *handle = NULL;
-
-	BUG_ON(!irqs_disabled());
-	BUG_ON(chunks >= NCHUNKS);
-	handle = zs_malloc(pool, size);
-	if (!handle)
-		goto out;
-	atomic_inc(&zv_curr_dist_counts[chunks]);
-	atomic_inc(&zv_cumul_dist_counts[chunks]);
-	zv = zs_map_object(pool, handle);
-	zv->index = index;
-	zv->oid = *oid;
-	zv->pool_id = pool_id;
-	zv->size = clen;
-	SET_SENTINEL(zv, ZVH);
-	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
-	zs_unmap_object(pool, handle);
-out:
-	return handle;
-}
-
-static void zv_free(struct zs_pool *pool, void *handle)
-{
-	unsigned long flags;
-	struct zv_hdr *zv;
-	uint16_t size;
-	int chunks;
-
-	zv = zs_map_object(pool, handle);
-	ASSERT_SENTINEL(zv, ZVH);
-	size = zv->size + sizeof(struct zv_hdr);
-	INVERT_SENTINEL(zv, ZVH);
-	zs_unmap_object(pool, handle);
-
-	chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT;
-	BUG_ON(chunks >= NCHUNKS);
-	atomic_dec(&zv_curr_dist_counts[chunks]);
-
-	local_irq_save(flags);
-	zs_free(pool, handle);
-	local_irq_restore(flags);
-}
-
-static void zv_decompress(struct page *page, void *handle)
-{
-	unsigned int clen = PAGE_SIZE;
-	char *to_va;
-	int ret;
-	struct zv_hdr *zv;
-
-	zv = zs_map_object(zcache_host.zspool, handle);
-	BUG_ON(zv->size == 0);
-	ASSERT_SENTINEL(zv, ZVH);
-	to_va = kmap_atomic(page);
-	ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv),
-				zv->size, to_va, &clen);
-	kunmap_atomic(to_va);
-	zs_unmap_object(zcache_host.zspool, handle);
-	BUG_ON(ret);
-	BUG_ON(clen != PAGE_SIZE);
-}
-
-#ifdef CONFIG_SYSFS
-/*
- * show a distribution of compression stats for zv pages.
- */
-
-static int zv_curr_dist_counts_show(char *buf)
-{
-	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		n = atomic_read(&zv_curr_dist_counts[i]);
-		p += sprintf(p, "%lu ", n);
-		chunks += n;
-		sum_total_chunks += i * n;
-	}
-	p += sprintf(p, "mean:%lu\n",
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-
-static int zv_cumul_dist_counts_show(char *buf)
-{
-	unsigned long i, n, chunks = 0, sum_total_chunks = 0;
-	char *p = buf;
-
-	for (i = 0; i < NCHUNKS; i++) {
-		n = atomic_read(&zv_cumul_dist_counts[i]);
-		p += sprintf(p, "%lu ", n);
-		chunks += n;
-		sum_total_chunks += i * n;
-	}
-	p += sprintf(p, "mean:%lu\n",
-		chunks == 0 ? 0 : sum_total_chunks / chunks);
-	return p - buf;
-}
-
-/*
- * setting zv_max_zsize via sysfs causes all persistent (e.g. swap)
- * pages that don't compress to less than this value (including metadata
- * overhead) to be rejected.  We don't allow the value to get too close
- * to PAGE_SIZE.
- */
-static ssize_t zv_max_zsize_show(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%u\n", zv_max_zsize);
-}
-
-static ssize_t zv_max_zsize_store(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
-		return -EINVAL;
-	zv_max_zsize = val;
-	return count;
-}
-
-/*
- * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap)
- * pages that don't compress to less than this value (including metadata
- * overhead) to be rejected UNLESS the mean compression is also smaller
- * than this value.  In other words, we are load-balancing-by-zsize the
- * accepted pages.  Again, we don't allow the value to get too close
- * to PAGE_SIZE.
- */
-static ssize_t zv_max_mean_zsize_show(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    char *buf)
-{
-	return sprintf(buf, "%u\n", zv_max_mean_zsize);
-}
-
-static ssize_t zv_max_mean_zsize_store(struct kobject *kobj,
-				    struct kobj_attribute *attr,
-				    const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7))
-		return -EINVAL;
-	zv_max_mean_zsize = val;
-	return count;
-}
-
-/*
- * setting zv_page_count_policy_percent via sysfs sets an upper bound of
- * persistent (e.g. swap) pages that will be retained according to:
- *     (zv_page_count_policy_percent * totalram_pages) / 100)
- * when that limit is reached, further puts will be rejected (until
- * some pages have been flushed).  Note that, due to compression,
- * this number may exceed 100; it defaults to 75 and we set an
- * arbitary limit of 150.  A poor choice will almost certainly result
- * in OOM's, so this value should only be changed prudently.
- */
-static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj,
-						 struct kobj_attribute *attr,
-						 char *buf)
-{
-	return sprintf(buf, "%u\n", zv_page_count_policy_percent);
-}
-
-static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj,
-						  struct kobj_attribute *attr,
-						  const char *buf, size_t count)
-{
-	unsigned long val;
-	int err;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	err = kstrtoul(buf, 10, &val);
-	if (err || (val == 0) || (val > 150))
-		return -EINVAL;
-	zv_page_count_policy_percent = val;
-	return count;
-}
-
-static struct kobj_attribute zcache_zv_max_zsize_attr = {
-		.attr = { .name = "zv_max_zsize", .mode = 0644 },
-		.show = zv_max_zsize_show,
-		.store = zv_max_zsize_store,
-};
-
-static struct kobj_attribute zcache_zv_max_mean_zsize_attr = {
-		.attr = { .name = "zv_max_mean_zsize", .mode = 0644 },
-		.show = zv_max_mean_zsize_show,
-		.store = zv_max_mean_zsize_store,
-};
-
-static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = {
-		.attr = { .name = "zv_page_count_policy_percent",
-			  .mode = 0644 },
-		.show = zv_page_count_policy_percent_show,
-		.store = zv_page_count_policy_percent_store,
-};
-#endif
-
-/*
- * zcache core code starts here
- */
-
-/* useful stats not collected by cleancache or frontswap */
-static unsigned long zcache_flush_total;
-static unsigned long zcache_flush_found;
-static unsigned long zcache_flobj_total;
-static unsigned long zcache_flobj_found;
-static unsigned long zcache_failed_eph_puts;
-static unsigned long zcache_failed_pers_puts;
-
-/*
- * Tmem operations assume the poolid implies the invoking client.
- * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
- * RAMster has each client numbered by cluster node, and a KVM version
- * of zcache would have one client per guest and each client might
- * have a poolid==N.
- */
-static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
-{
-	struct tmem_pool *pool = NULL;
-	struct zcache_client *cli = NULL;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else {
-		if (cli_id >= MAX_CLIENTS)
-			goto out;
-		cli = &zcache_clients[cli_id];
-		if (cli == NULL)
-			goto out;
-		atomic_inc(&cli->refcount);
-	}
-	if (poolid < MAX_POOLS_PER_CLIENT) {
-		pool = cli->tmem_pools[poolid];
-		if (pool != NULL)
-			atomic_inc(&pool->refcount);
-	}
-out:
-	return pool;
-}
-
-static void zcache_put_pool(struct tmem_pool *pool)
-{
-	struct zcache_client *cli = NULL;
-
-	if (pool == NULL)
-		BUG();
-	cli = pool->client;
-	atomic_dec(&pool->refcount);
-	atomic_dec(&cli->refcount);
-}
-
-int zcache_new_client(uint16_t cli_id)
-{
-	struct zcache_client *cli = NULL;
-	int ret = -1;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	if (cli->allocated)
-		goto out;
-	cli->allocated = 1;
-#ifdef CONFIG_FRONTSWAP
-	cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK);
-	if (cli->zspool == NULL)
-		goto out;
-#endif
-	ret = 0;
-out:
-	return ret;
-}
-
-/* counters for debugging */
-static unsigned long zcache_failed_get_free_pages;
-static unsigned long zcache_failed_alloc;
-static unsigned long zcache_put_to_flush;
-
-/*
- * for now, used named slabs so can easily track usage; later can
- * either just use kmalloc, or perhaps add a slab-like allocator
- * to more carefully manage total memory utilization
- */
-static struct kmem_cache *zcache_objnode_cache;
-static struct kmem_cache *zcache_obj_cache;
-static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_obj_count_max;
-static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_objnode_count_max;
-
-/*
- * to avoid memory allocation recursion (e.g. due to direct reclaim), we
- * preload all necessary data structures so the hostops callbacks never
- * actually do a malloc
- */
-struct zcache_preload {
-	void *page;
-	struct tmem_obj *obj;
-	int nr;
-	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
-};
-static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
-
-static int zcache_do_preload(struct tmem_pool *pool)
-{
-	struct zcache_preload *kp;
-	struct tmem_objnode *objnode;
-	struct tmem_obj *obj;
-	void *page;
-	int ret = -ENOMEM;
-
-	if (unlikely(zcache_objnode_cache == NULL))
-		goto out;
-	if (unlikely(zcache_obj_cache == NULL))
-		goto out;
-	preempt_disable();
-	kp = &__get_cpu_var(zcache_preloads);
-	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
-		preempt_enable_no_resched();
-		objnode = kmem_cache_alloc(zcache_objnode_cache,
-				ZCACHE_GFP_MASK);
-		if (unlikely(objnode == NULL)) {
-			zcache_failed_alloc++;
-			goto out;
-		}
-		preempt_disable();
-		kp = &__get_cpu_var(zcache_preloads);
-		if (kp->nr < ARRAY_SIZE(kp->objnodes))
-			kp->objnodes[kp->nr++] = objnode;
-		else
-			kmem_cache_free(zcache_objnode_cache, objnode);
-	}
-	preempt_enable_no_resched();
-	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
-	if (unlikely(obj == NULL)) {
-		zcache_failed_alloc++;
-		goto out;
-	}
-	page = (void *)__get_free_page(ZCACHE_GFP_MASK);
-	if (unlikely(page == NULL)) {
-		zcache_failed_get_free_pages++;
-		kmem_cache_free(zcache_obj_cache, obj);
-		goto out;
-	}
-	preempt_disable();
-	kp = &__get_cpu_var(zcache_preloads);
-	if (kp->obj == NULL)
-		kp->obj = obj;
-	else
-		kmem_cache_free(zcache_obj_cache, obj);
-	if (kp->page == NULL)
-		kp->page = page;
-	else
-		free_page((unsigned long)page);
-	ret = 0;
-out:
-	return ret;
-}
-
-static void *zcache_get_free_page(void)
-{
-	struct zcache_preload *kp;
-	void *page;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	page = kp->page;
-	BUG_ON(page == NULL);
-	kp->page = NULL;
-	return page;
-}
-
-static void zcache_free_page(void *p)
-{
-	free_page((unsigned long)p);
-}
-
-/*
- * zcache implementation for tmem host ops
- */
-
-static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
-{
-	struct tmem_objnode *objnode = NULL;
-	unsigned long count;
-	struct zcache_preload *kp;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	if (kp->nr <= 0)
-		goto out;
-	objnode = kp->objnodes[kp->nr - 1];
-	BUG_ON(objnode == NULL);
-	kp->objnodes[kp->nr - 1] = NULL;
-	kp->nr--;
-	count = atomic_inc_return(&zcache_curr_objnode_count);
-	if (count > zcache_curr_objnode_count_max)
-		zcache_curr_objnode_count_max = count;
-out:
-	return objnode;
-}
-
-static void zcache_objnode_free(struct tmem_objnode *objnode,
-					struct tmem_pool *pool)
-{
-	atomic_dec(&zcache_curr_objnode_count);
-	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
-	kmem_cache_free(zcache_objnode_cache, objnode);
-}
-
-static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
-{
-	struct tmem_obj *obj = NULL;
-	unsigned long count;
-	struct zcache_preload *kp;
-
-	kp = &__get_cpu_var(zcache_preloads);
-	obj = kp->obj;
-	BUG_ON(obj == NULL);
-	kp->obj = NULL;
-	count = atomic_inc_return(&zcache_curr_obj_count);
-	if (count > zcache_curr_obj_count_max)
-		zcache_curr_obj_count_max = count;
-	return obj;
-}
-
-static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
-{
-	atomic_dec(&zcache_curr_obj_count);
-	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
-	kmem_cache_free(zcache_obj_cache, obj);
-}
-
-static struct tmem_hostops zcache_hostops = {
-	.obj_alloc = zcache_obj_alloc,
-	.obj_free = zcache_obj_free,
-	.objnode_alloc = zcache_objnode_alloc,
-	.objnode_free = zcache_objnode_free,
-};
-
-/*
- * zcache implementations for PAM page descriptor ops
- */
-
-static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_eph_pampd_count_max;
-static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
-static unsigned long zcache_curr_pers_pampd_count_max;
-
-/* forward reference */
-static int zcache_compress(struct page *from, void **out_va, unsigned *out_len);
-
-static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph,
-				struct tmem_pool *pool, struct tmem_oid *oid,
-				 uint32_t index)
-{
-	void *pampd = NULL, *cdata;
-	unsigned clen;
-	int ret;
-	unsigned long count;
-	struct page *page = (struct page *)(data);
-	struct zcache_client *cli = pool->client;
-	uint16_t client_id = get_client_id_from_client(cli);
-	unsigned long zv_mean_zsize;
-	unsigned long curr_pers_pampd_count;
-	u64 total_zsize;
-
-	if (eph) {
-		ret = zcache_compress(page, &cdata, &clen);
-		if (ret == 0)
-			goto out;
-		if (clen == 0 || clen > zbud_max_buddy_size()) {
-			zcache_compress_poor++;
-			goto out;
-		}
-		pampd = (void *)zbud_create(client_id, pool->pool_id, oid,
-						index, page, cdata, clen);
-		if (pampd != NULL) {
-			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
-			if (count > zcache_curr_eph_pampd_count_max)
-				zcache_curr_eph_pampd_count_max = count;
-		}
-	} else {
-		curr_pers_pampd_count =
-			atomic_read(&zcache_curr_pers_pampd_count);
-		if (curr_pers_pampd_count >
-		    (zv_page_count_policy_percent * totalram_pages) / 100)
-			goto out;
-		ret = zcache_compress(page, &cdata, &clen);
-		if (ret == 0)
-			goto out;
-		/* reject if compression is too poor */
-		if (clen > zv_max_zsize) {
-			zcache_compress_poor++;
-			goto out;
-		}
-		/* reject if mean compression is too poor */
-		if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) {
-			total_zsize = zs_get_total_size_bytes(cli->zspool);
-			zv_mean_zsize = div_u64(total_zsize,
-						curr_pers_pampd_count);
-			if (zv_mean_zsize > zv_max_mean_zsize) {
-				zcache_mean_compress_poor++;
-				goto out;
-			}
-		}
-		pampd = (void *)zv_create(cli->zspool, pool->pool_id,
-						oid, index, cdata, clen);
-		if (pampd == NULL)
-			goto out;
-		count = atomic_inc_return(&zcache_curr_pers_pampd_count);
-		if (count > zcache_curr_pers_pampd_count_max)
-			zcache_curr_pers_pampd_count_max = count;
-	}
-out:
-	return pampd;
-}
-
-/*
- * fill the pageframe corresponding to the struct page with the data
- * from the passed pampd
- */
-static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw,
-					void *pampd, struct tmem_pool *pool,
-					struct tmem_oid *oid, uint32_t index)
-{
-	int ret = 0;
-
-	BUG_ON(is_ephemeral(pool));
-	zv_decompress((struct page *)(data), pampd);
-	return ret;
-}
-
-/*
- * fill the pageframe corresponding to the struct page with the data
- * from the passed pampd
- */
-static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw,
-					void *pampd, struct tmem_pool *pool,
-					struct tmem_oid *oid, uint32_t index)
-{
-	BUG_ON(!is_ephemeral(pool));
-	if (zbud_decompress((struct page *)(data), pampd) < 0)
-		return -EINVAL;
-	zbud_free_and_delist((struct zbud_hdr *)pampd);
-	atomic_dec(&zcache_curr_eph_pampd_count);
-	return 0;
-}
-
-/*
- * free the pampd and remove it from any zcache lists
- * pampd must no longer be pointed to from any tmem data structures!
- */
-static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
-				struct tmem_oid *oid, uint32_t index)
-{
-	struct zcache_client *cli = pool->client;
-
-	if (is_ephemeral(pool)) {
-		zbud_free_and_delist((struct zbud_hdr *)pampd);
-		atomic_dec(&zcache_curr_eph_pampd_count);
-		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
-	} else {
-		zv_free(cli->zspool, pampd);
-		atomic_dec(&zcache_curr_pers_pampd_count);
-		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
-	}
-}
-
-static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj)
-{
-}
-
-static void zcache_pampd_new_obj(struct tmem_obj *obj)
-{
-}
-
-static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj)
-{
-	return -1;
-}
-
-static bool zcache_pampd_is_remote(void *pampd)
-{
-	return 0;
-}
-
-static struct tmem_pamops zcache_pamops = {
-	.create = zcache_pampd_create,
-	.get_data = zcache_pampd_get_data,
-	.get_data_and_free = zcache_pampd_get_data_and_free,
-	.free = zcache_pampd_free,
-	.free_obj = zcache_pampd_free_obj,
-	.new_obj = zcache_pampd_new_obj,
-	.replace_in_obj = zcache_pampd_replace_in_obj,
-	.is_remote = zcache_pampd_is_remote,
-};
-
-/*
- * zcache compression/decompression and related per-cpu stuff
- */
-
-static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
-#define ZCACHE_DSTMEM_ORDER 1
-
-static int zcache_compress(struct page *from, void **out_va, unsigned *out_len)
-{
-	int ret = 0;
-	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
-	char *from_va;
-
-	BUG_ON(!irqs_disabled());
-	if (unlikely(dmem == NULL))
-		goto out;  /* no buffer or no compressor so can't compress */
-	*out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
-	from_va = kmap_atomic(from);
-	mb();
-	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
-				out_len);
-	BUG_ON(ret);
-	*out_va = dmem;
-	kunmap_atomic(from_va);
-	ret = 1;
-out:
-	return ret;
-}
-
-static int zcache_comp_cpu_up(int cpu)
-{
-	struct crypto_comp *tfm;
-
-	tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
-	if (IS_ERR(tfm))
-		return NOTIFY_BAD;
-	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
-	return NOTIFY_OK;
-}
-
-static void zcache_comp_cpu_down(int cpu)
-{
-	struct crypto_comp *tfm;
-
-	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
-	crypto_free_comp(tfm);
-	*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
-}
-
-static int zcache_cpu_notifier(struct notifier_block *nb,
-				unsigned long action, void *pcpu)
-{
-	int ret, cpu = (long)pcpu;
-	struct zcache_preload *kp;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-		ret = zcache_comp_cpu_up(cpu);
-		if (ret != NOTIFY_OK) {
-			pr_err("zcache: can't allocate compressor transform\n");
-			return ret;
-		}
-		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
-			GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
-		break;
-	case CPU_DEAD:
-	case CPU_UP_CANCELED:
-		zcache_comp_cpu_down(cpu);
-		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
-			ZCACHE_DSTMEM_ORDER);
-		per_cpu(zcache_dstmem, cpu) = NULL;
-		kp = &per_cpu(zcache_preloads, cpu);
-		while (kp->nr) {
-			kmem_cache_free(zcache_objnode_cache,
-					kp->objnodes[kp->nr - 1]);
-			kp->objnodes[kp->nr - 1] = NULL;
-			kp->nr--;
-		}
-		if (kp->obj) {
-			kmem_cache_free(zcache_obj_cache, kp->obj);
-			kp->obj = NULL;
-		}
-		if (kp->page) {
-			free_page((unsigned long)kp->page);
-			kp->page = NULL;
-		}
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block zcache_cpu_notifier_block = {
-	.notifier_call = zcache_cpu_notifier
-};
-
-#ifdef CONFIG_SYSFS
-#define ZCACHE_SYSFS_RO(_name) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-		return sprintf(buf, "%lu\n", zcache_##_name); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-	    return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
-	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
-				struct kobj_attribute *attr, char *buf) \
-	{ \
-	    return _func(buf); \
-	} \
-	static struct kobj_attribute zcache_##_name##_attr = { \
-		.attr = { .name = __stringify(_name), .mode = 0444 }, \
-		.show = zcache_##_name##_show, \
-	}
-
-ZCACHE_SYSFS_RO(curr_obj_count_max);
-ZCACHE_SYSFS_RO(curr_objnode_count_max);
-ZCACHE_SYSFS_RO(flush_total);
-ZCACHE_SYSFS_RO(flush_found);
-ZCACHE_SYSFS_RO(flobj_total);
-ZCACHE_SYSFS_RO(flobj_found);
-ZCACHE_SYSFS_RO(failed_eph_puts);
-ZCACHE_SYSFS_RO(failed_pers_puts);
-ZCACHE_SYSFS_RO(zbud_curr_zbytes);
-ZCACHE_SYSFS_RO(zbud_cumul_zpages);
-ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
-ZCACHE_SYSFS_RO(zbud_buddied_count);
-ZCACHE_SYSFS_RO(zbpg_unused_list_count);
-ZCACHE_SYSFS_RO(evicted_raw_pages);
-ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
-ZCACHE_SYSFS_RO(evicted_buddied_pages);
-ZCACHE_SYSFS_RO(failed_get_free_pages);
-ZCACHE_SYSFS_RO(failed_alloc);
-ZCACHE_SYSFS_RO(put_to_flush);
-ZCACHE_SYSFS_RO(compress_poor);
-ZCACHE_SYSFS_RO(mean_compress_poor);
-ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
-ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
-ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
-ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
-ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
-			zbud_show_unbuddied_list_counts);
-ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
-			zbud_show_cumul_chunk_counts);
-ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts,
-			zv_curr_dist_counts_show);
-ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts,
-			zv_cumul_dist_counts_show);
-
-static struct attribute *zcache_attrs[] = {
-	&zcache_curr_obj_count_attr.attr,
-	&zcache_curr_obj_count_max_attr.attr,
-	&zcache_curr_objnode_count_attr.attr,
-	&zcache_curr_objnode_count_max_attr.attr,
-	&zcache_flush_total_attr.attr,
-	&zcache_flobj_total_attr.attr,
-	&zcache_flush_found_attr.attr,
-	&zcache_flobj_found_attr.attr,
-	&zcache_failed_eph_puts_attr.attr,
-	&zcache_failed_pers_puts_attr.attr,
-	&zcache_compress_poor_attr.attr,
-	&zcache_mean_compress_poor_attr.attr,
-	&zcache_zbud_curr_raw_pages_attr.attr,
-	&zcache_zbud_curr_zpages_attr.attr,
-	&zcache_zbud_curr_zbytes_attr.attr,
-	&zcache_zbud_cumul_zpages_attr.attr,
-	&zcache_zbud_cumul_zbytes_attr.attr,
-	&zcache_zbud_buddied_count_attr.attr,
-	&zcache_zbpg_unused_list_count_attr.attr,
-	&zcache_evicted_raw_pages_attr.attr,
-	&zcache_evicted_unbuddied_pages_attr.attr,
-	&zcache_evicted_buddied_pages_attr.attr,
-	&zcache_failed_get_free_pages_attr.attr,
-	&zcache_failed_alloc_attr.attr,
-	&zcache_put_to_flush_attr.attr,
-	&zcache_zbud_unbuddied_list_counts_attr.attr,
-	&zcache_zbud_cumul_chunk_counts_attr.attr,
-	&zcache_zv_curr_dist_counts_attr.attr,
-	&zcache_zv_cumul_dist_counts_attr.attr,
-	&zcache_zv_max_zsize_attr.attr,
-	&zcache_zv_max_mean_zsize_attr.attr,
-	&zcache_zv_page_count_policy_percent_attr.attr,
-	NULL,
-};
-
-static struct attribute_group zcache_attr_group = {
-	.attrs = zcache_attrs,
-	.name = "zcache",
-};
-
-#endif /* CONFIG_SYSFS */
-/*
- * When zcache is disabled ("frozen"), pools can be created and destroyed,
- * but all puts (and thus all other operations that require memory allocation)
- * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
- * data consistency requires all puts while frozen to be converted into
- * flushes.
- */
-static bool zcache_freeze;
-
-/*
- * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
- */
-static int shrink_zcache_memory(struct shrinker *shrink,
-				struct shrink_control *sc)
-{
-	int ret = -1;
-	int nr = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
-
-	if (nr >= 0) {
-		if (!(gfp_mask & __GFP_FS))
-			/* does this case really need to be skipped? */
-			goto out;
-		zbud_evict_pages(nr);
-	}
-	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
-out:
-	return ret;
-}
-
-static struct shrinker zcache_shrinker = {
-	.shrink = shrink_zcache_memory,
-	.seeks = DEFAULT_SEEKS,
-};
-
-/*
- * zcache shims between cleancache/frontswap ops and tmem
- */
-
-static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
-				uint32_t index, struct page *page)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-
-	BUG_ON(!irqs_disabled());
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	if (unlikely(pool == NULL))
-		goto out;
-	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
-		/* preload does preempt_disable on success */
-		ret = tmem_put(pool, oidp, index, (char *)(page),
-				PAGE_SIZE, 0, is_ephemeral(pool));
-		if (ret < 0) {
-			if (is_ephemeral(pool))
-				zcache_failed_eph_puts++;
-			else
-				zcache_failed_pers_puts++;
-		}
-		zcache_put_pool(pool);
-		preempt_enable_no_resched();
-	} else {
-		zcache_put_to_flush++;
-		if (atomic_read(&pool->obj_count) > 0)
-			/* the put fails whether the flush succeeds or not */
-			(void)tmem_flush_page(pool, oidp, index);
-		zcache_put_pool(pool);
-	}
-out:
-	return ret;
-}
-
-static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
-				uint32_t index, struct page *page)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	unsigned long flags;
-	size_t size = PAGE_SIZE;
-
-	local_irq_save(flags);
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_get(pool, oidp, index, (char *)(page),
-					&size, 0, is_ephemeral(pool));
-		zcache_put_pool(pool);
-	}
-	local_irq_restore(flags);
-	return ret;
-}
-
-static int zcache_flush_page(int cli_id, int pool_id,
-				struct tmem_oid *oidp, uint32_t index)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	zcache_flush_total++;
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_flush_page(pool, oidp, index);
-		zcache_put_pool(pool);
-	}
-	if (ret >= 0)
-		zcache_flush_found++;
-	local_irq_restore(flags);
-	return ret;
-}
-
-static int zcache_flush_object(int cli_id, int pool_id,
-				struct tmem_oid *oidp)
-{
-	struct tmem_pool *pool;
-	int ret = -1;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	zcache_flobj_total++;
-	pool = zcache_get_pool_by_id(cli_id, pool_id);
-	if (likely(pool != NULL)) {
-		if (atomic_read(&pool->obj_count) > 0)
-			ret = tmem_flush_object(pool, oidp);
-		zcache_put_pool(pool);
-	}
-	if (ret >= 0)
-		zcache_flobj_found++;
-	local_irq_restore(flags);
-	return ret;
-}
-
-static int zcache_destroy_pool(int cli_id, int pool_id)
-{
-	struct tmem_pool *pool = NULL;
-	struct zcache_client *cli = NULL;
-	int ret = -1;
-
-	if (pool_id < 0)
-		goto out;
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	atomic_inc(&cli->refcount);
-	pool = cli->tmem_pools[pool_id];
-	if (pool == NULL)
-		goto out;
-	cli->tmem_pools[pool_id] = NULL;
-	/* wait for pool activity on other cpus to quiesce */
-	while (atomic_read(&pool->refcount) != 0)
-		;
-	atomic_dec(&cli->refcount);
-	local_bh_disable();
-	ret = tmem_destroy_pool(pool);
-	local_bh_enable();
-	kfree(pool);
-	pr_info("zcache: destroyed pool id=%d, cli_id=%d\n",
-			pool_id, cli_id);
-out:
-	return ret;
-}
-
-static int zcache_new_pool(uint16_t cli_id, uint32_t flags)
-{
-	int poolid = -1;
-	struct tmem_pool *pool;
-	struct zcache_client *cli = NULL;
-
-	if (cli_id == LOCAL_CLIENT)
-		cli = &zcache_host;
-	else if ((unsigned int)cli_id < MAX_CLIENTS)
-		cli = &zcache_clients[cli_id];
-	if (cli == NULL)
-		goto out;
-	atomic_inc(&cli->refcount);
-	pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
-	if (pool == NULL) {
-		pr_info("zcache: pool creation failed: out of memory\n");
-		goto out;
-	}
-
-	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
-		if (cli->tmem_pools[poolid] == NULL)
-			break;
-	if (poolid >= MAX_POOLS_PER_CLIENT) {
-		pr_info("zcache: pool creation failed: max exceeded\n");
-		kfree(pool);
-		poolid = -1;
-		goto out;
-	}
-	atomic_set(&pool->refcount, 0);
-	pool->client = cli;
-	pool->pool_id = poolid;
-	tmem_new_pool(pool, flags);
-	cli->tmem_pools[poolid] = pool;
-	pr_info("zcache: created %s tmem pool, id=%d, client=%d\n",
-		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
-		poolid, cli_id);
-out:
-	if (cli != NULL)
-		atomic_dec(&cli->refcount);
-	return poolid;
-}
-
-/**********
- * Two kernel functionalities currently can be layered on top of tmem.
- * These are "cleancache" which is used as a second-chance cache for clean
- * page cache pages; and "frontswap" which is used for swap pages
- * to avoid writes to disk.  A generic "shim" is provided here for each
- * to translate in-kernel semantics to zcache semantics.
- */
-
-#ifdef CONFIG_CLEANCACHE
-static void zcache_cleancache_put_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (likely(ind == index))
-		(void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page);
-}
-
-static int zcache_cleancache_get_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index, struct page *page)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-	int ret = -1;
-
-	if (likely(ind == index))
-		ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page);
-	return ret;
-}
-
-static void zcache_cleancache_flush_page(int pool_id,
-					struct cleancache_filekey key,
-					pgoff_t index)
-{
-	u32 ind = (u32) index;
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	if (likely(ind == index))
-		(void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
-}
-
-static void zcache_cleancache_flush_inode(int pool_id,
-					struct cleancache_filekey key)
-{
-	struct tmem_oid oid = *(struct tmem_oid *)&key;
-
-	(void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
-}
-
-static void zcache_cleancache_flush_fs(int pool_id)
-{
-	if (pool_id >= 0)
-		(void)zcache_destroy_pool(LOCAL_CLIENT, pool_id);
-}
-
-static int zcache_cleancache_init_fs(size_t pagesize)
-{
-	BUG_ON(sizeof(struct cleancache_filekey) !=
-				sizeof(struct tmem_oid));
-	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(LOCAL_CLIENT, 0);
-}
-
-static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
-{
-	/* shared pools are unsupported and map to private */
-	BUG_ON(sizeof(struct cleancache_filekey) !=
-				sizeof(struct tmem_oid));
-	BUG_ON(pagesize != PAGE_SIZE);
-	return zcache_new_pool(LOCAL_CLIENT, 0);
-}
-
-static struct cleancache_ops zcache_cleancache_ops = {
-	.put_page = zcache_cleancache_put_page,
-	.get_page = zcache_cleancache_get_page,
-	.invalidate_page = zcache_cleancache_flush_page,
-	.invalidate_inode = zcache_cleancache_flush_inode,
-	.invalidate_fs = zcache_cleancache_flush_fs,
-	.init_shared_fs = zcache_cleancache_init_shared_fs,
-	.init_fs = zcache_cleancache_init_fs
-};
-
-struct cleancache_ops zcache_cleancache_register_ops(void)
-{
-	struct cleancache_ops old_ops =
-		cleancache_register_ops(&zcache_cleancache_ops);
-
-	return old_ops;
-}
-#endif
-
-#ifdef CONFIG_FRONTSWAP
-/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-static int zcache_frontswap_poolid = -1;
-
-/*
- * Swizzling increases objects per swaptype, increasing tmem concurrency
- * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
- * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from
- * frontswap_get_page(), but has side-effects. Hence using 8.
- */
-#define SWIZ_BITS		8
-#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
-#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
-#define iswiz(_ind)		(_ind >> SWIZ_BITS)
-
-static inline struct tmem_oid oswiz(unsigned type, u32 ind)
-{
-	struct tmem_oid oid = { .oid = { 0 } };
-	oid.oid[0] = _oswiz(type, ind);
-	return oid;
-}
-
-static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-	int ret = -1;
-	unsigned long flags;
-
-	BUG_ON(!PageLocked(page));
-	if (likely(ind64 == ind)) {
-		local_irq_save(flags);
-		ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
-					&oid, iswiz(ind), page);
-		local_irq_restore(flags);
-	}
-	return ret;
-}
-
-/* returns 0 if the page was successfully gotten from frontswap, -1 if
- * was not present (should never happen!) */
-static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
-				   struct page *page)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-	int ret = -1;
-
-	BUG_ON(!PageLocked(page));
-	if (likely(ind64 == ind))
-		ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
-					&oid, iswiz(ind), page);
-	return ret;
-}
-
-/* flush a single page from frontswap */
-static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
-{
-	u64 ind64 = (u64)offset;
-	u32 ind = (u32)offset;
-	struct tmem_oid oid = oswiz(type, ind);
-
-	if (likely(ind64 == ind))
-		(void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
-					&oid, iswiz(ind));
-}
-
-/* flush all pages from the passed swaptype */
-static void zcache_frontswap_flush_area(unsigned type)
-{
-	struct tmem_oid oid;
-	int ind;
-
-	for (ind = SWIZ_MASK; ind >= 0; ind--) {
-		oid = oswiz(type, ind);
-		(void)zcache_flush_object(LOCAL_CLIENT,
-						zcache_frontswap_poolid, &oid);
-	}
-}
-
-static void zcache_frontswap_init(unsigned ignored)
-{
-	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
-	if (zcache_frontswap_poolid < 0)
-		zcache_frontswap_poolid =
-			zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST);
-}
-
-static struct frontswap_ops zcache_frontswap_ops = {
-	.put_page = zcache_frontswap_put_page,
-	.get_page = zcache_frontswap_get_page,
-	.invalidate_page = zcache_frontswap_flush_page,
-	.invalidate_area = zcache_frontswap_flush_area,
-	.init = zcache_frontswap_init
-};
-
-struct frontswap_ops zcache_frontswap_register_ops(void)
-{
-	struct frontswap_ops old_ops =
-		frontswap_register_ops(&zcache_frontswap_ops);
-
-	return old_ops;
-}
-#endif
-
-/*
- * zcache initialization
- * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
- * NOTHING HAPPENS!
- */
-
-static int zcache_enabled;
-
-static int __init enable_zcache(char *s)
-{
-	zcache_enabled = 1;
-	return 1;
-}
-__setup("zcache", enable_zcache);
-
-/* allow independent dynamic disabling of cleancache and frontswap */
-
-static int use_cleancache = 1;
-
-static int __init no_cleancache(char *s)
-{
-	use_cleancache = 0;
-	return 1;
-}
-
-__setup("nocleancache", no_cleancache);
-
-static int use_frontswap = 1;
-
-static int __init no_frontswap(char *s)
-{
-	use_frontswap = 0;
-	return 1;
-}
-
-__setup("nofrontswap", no_frontswap);
-
-static int __init enable_zcache_compressor(char *s)
-{
-	strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ);
-	zcache_enabled = 1;
-	return 1;
-}
-__setup("zcache=", enable_zcache_compressor);
-
-
-static int zcache_comp_init(void)
-{
-	int ret = 0;
-
-	/* check crypto algorithm */
-	if (*zcache_comp_name != '\0') {
-		ret = crypto_has_comp(zcache_comp_name, 0, 0);
-		if (!ret)
-			pr_info("zcache: %s not supported\n",
-					zcache_comp_name);
-	}
-	if (!ret)
-		strcpy(zcache_comp_name, "lzo");
-	ret = crypto_has_comp(zcache_comp_name, 0, 0);
-	if (!ret) {
-		ret = 1;
-		goto out;
-	}
-	pr_info("zcache: using %s compressor\n", zcache_comp_name);
-
-	/* alloc percpu transforms */
-	ret = 0;
-	zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
-	if (!zcache_comp_pcpu_tfms)
-		ret = 1;
-out:
-	return ret;
-}
-
-static int __init zcache_init(void)
-{
-	int ret = 0;
-
-#ifdef CONFIG_SYSFS
-	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
-	if (ret) {
-		pr_err("zcache: can't create sysfs\n");
-		goto out;
-	}
-#endif /* CONFIG_SYSFS */
-#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
-	if (zcache_enabled) {
-		unsigned int cpu;
-
-		tmem_register_hostops(&zcache_hostops);
-		tmem_register_pamops(&zcache_pamops);
-		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
-		if (ret) {
-			pr_err("zcache: can't register cpu notifier\n");
-			goto out;
-		}
-		ret = zcache_comp_init();
-		if (ret) {
-			pr_err("zcache: compressor initialization failed\n");
-			goto out;
-		}
-		for_each_online_cpu(cpu) {
-			void *pcpu = (void *)(long)cpu;
-			zcache_cpu_notifier(&zcache_cpu_notifier_block,
-				CPU_UP_PREPARE, pcpu);
-		}
-	}
-	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
-				sizeof(struct tmem_objnode), 0, 0, NULL);
-	zcache_obj_cache = kmem_cache_create("zcache_obj",
-				sizeof(struct tmem_obj), 0, 0, NULL);
-	ret = zcache_new_client(LOCAL_CLIENT);
-	if (ret) {
-		pr_err("zcache: can't create client\n");
-		goto out;
-	}
-#endif
-#ifdef CONFIG_CLEANCACHE
-	if (zcache_enabled && use_cleancache) {
-		struct cleancache_ops old_ops;
-
-		zbud_init();
-		register_shrinker(&zcache_shrinker);
-		old_ops = zcache_cleancache_register_ops();
-		pr_info("zcache: cleancache enabled using kernel "
-			"transcendent memory and compression buddies\n");
-		if (old_ops.init_fs != NULL)
-			pr_warning("zcache: cleancache_ops overridden");
-	}
-#endif
-#ifdef CONFIG_FRONTSWAP
-	if (zcache_enabled && use_frontswap) {
-		struct frontswap_ops old_ops;
-
-		old_ops = zcache_frontswap_register_ops();
-		pr_info("zcache: frontswap enabled using kernel "
-			"transcendent memory and zsmalloc\n");
-		if (old_ops.init != NULL)
-			pr_warning("zcache: frontswap_ops overridden");
-	}
-#endif
-out:
-	return ret;
-}
-
-module_init(zcache_init)
diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile
deleted file mode 100644
index cb0f9ced6..000000000
--- a/drivers/staging/zram/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-zram-y	:=	zram_drv.o
-
-obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt
deleted file mode 100644
index 765d790ae..000000000
--- a/drivers/staging/zram/zram.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-zram: Compressed RAM based block devices
-----------------------------------------
-
-Project home: http://compcache.googlecode.com/
-
-* Introduction
-
-The zram module creates RAM based block devices named /dev/zram<id>
-(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
-in memory itself. These disks allow very fast I/O and compression provides
-good amounts of memory savings. Some of the usecases include /tmp storage,
-use as swap disks, various caches under /var and maybe many more :)
-
-Statistics for individual zram devices are exported through sysfs nodes at
-/sys/block/zram<id>/
-
-* Usage
-
-Following shows a typical sequence of steps for using zram.
-
-1) Load Module:
-	modprobe zram num_devices=4
-	This creates 4 devices: /dev/zram{0,1,2,3}
-	(num_devices parameter is optional. Default: 1)
-
-2) Set Disksize
-        Set disk size by writing the value to sysfs node 'disksize'.
-        The value can be either in bytes or you can use mem suffixes.
-        Examples:
-            # Initialize /dev/zram0 with 50MB disksize
-            echo $((50*1024*1024)) > /sys/block/zram0/disksize
-
-            # Using mem suffixes
-            echo 256K > /sys/block/zram0/disksize
-            echo 512M > /sys/block/zram0/disksize
-            echo 1G > /sys/block/zram0/disksize
-
-3) Activate:
-	mkswap /dev/zram0
-	swapon /dev/zram0
-
-	mkfs.ext4 /dev/zram1
-	mount /dev/zram1 /tmp
-
-4) Stats:
-	Per-device statistics are exported as various nodes under
-	/sys/block/zram<id>/
-		disksize
-		num_reads
-		num_writes
-		invalid_io
-		notify_free
-		discard
-		zero_pages
-		orig_data_size
-		compr_data_size
-		mem_used_total
-
-5) Deactivate:
-	swapoff /dev/zram0
-	umount /dev/zram1
-
-6) Reset:
-	Write any positive value to 'reset' sysfs node
-	echo 1 > /sys/block/zram0/reset
-	echo 1 > /sys/block/zram1/reset
-
-	This frees all the memory allocated for the given device and
-	resets the disksize to zero. You must set the disksize again
-	before reusing the device.
-
-Please report any problems at:
- - Mailing list: linux-mm-cc at laptop dot org
- - Issue tracker: http://code.google.com/p/compcache/issues/list
-
-Nitin Gupta
-ngupta@vflare.org
diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
deleted file mode 100644
index 83fa65773..000000000
--- a/drivers/staging/zram/zram_drv.c
+++ /dev/null
@@ -1,995 +0,0 @@
-/*
- * Compressed RAM block device
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- *
- * Project home: http://compcache.googlecode.com
- */
-
-#define KMSG_COMPONENT "zram"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#ifdef CONFIG_ZRAM_DEBUG
-#define DEBUG
-#endif
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/bio.h>
-#include <linux/bitops.h>
-#include <linux/blkdev.h>
-#include <linux/buffer_head.h>
-#include <linux/device.h>
-#include <linux/genhd.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/lzo.h>
-#include <linux/string.h>
-#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
-
-#include "zram_drv.h"
-
-/* Globals */
-static int zram_major;
-static struct zram *zram_devices;
-
-/*
- * We don't need to see memory allocation errors more than once every 1
- * second to know that a problem is occurring.
- */
-#define ALLOC_ERROR_LOG_RATE_MS 1000
-
-/* Module params (documentation at end) */
-static unsigned int num_devices = 1;
-
-static inline struct zram *dev_to_zram(struct device *dev)
-{
-	return (struct zram *)dev_to_disk(dev)->private_data;
-}
-
-static ssize_t disksize_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n", zram->disksize);
-}
-
-static ssize_t initstate_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->init_done);
-}
-
-static ssize_t num_reads_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-			(u64)atomic64_read(&zram->stats.num_reads));
-}
-
-static ssize_t num_writes_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-			(u64)atomic64_read(&zram->stats.num_writes));
-}
-
-static ssize_t invalid_io_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-			(u64)atomic64_read(&zram->stats.invalid_io));
-}
-
-static ssize_t notify_free_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-			(u64)atomic64_read(&zram->stats.notify_free));
-}
-
-static ssize_t zero_pages_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->stats.pages_zero);
-}
-
-static ssize_t orig_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
-}
-
-static ssize_t compr_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-			(u64)atomic64_read(&zram->stats.compr_size));
-}
-
-static ssize_t mem_used_total_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	u64 val = 0;
-	struct zram *zram = dev_to_zram(dev);
-	struct zram_meta *meta = zram->meta;
-
-	down_read(&zram->init_lock);
-	if (zram->init_done)
-		val = zs_get_total_size_bytes(meta->mem_pool);
-	up_read(&zram->init_lock);
-
-	return sprintf(buf, "%llu\n", val);
-}
-
-static int zram_test_flag(struct zram_meta *meta, u32 index,
-			enum zram_pageflags flag)
-{
-	return meta->table[index].flags & BIT(flag);
-}
-
-static void zram_set_flag(struct zram_meta *meta, u32 index,
-			enum zram_pageflags flag)
-{
-	meta->table[index].flags |= BIT(flag);
-}
-
-static void zram_clear_flag(struct zram_meta *meta, u32 index,
-			enum zram_pageflags flag)
-{
-	meta->table[index].flags &= ~BIT(flag);
-}
-
-static inline int is_partial_io(struct bio_vec *bvec)
-{
-	return bvec->bv_len != PAGE_SIZE;
-}
-
-/*
- * Check if request is within bounds and aligned on zram logical blocks.
- */
-static inline int valid_io_request(struct zram *zram, struct bio *bio)
-{
-	u64 start, end, bound;
-
-	/* unaligned request */
-	if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
-		return 0;
-	if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
-		return 0;
-
-	start = bio->bi_sector;
-	end = start + (bio->bi_size >> SECTOR_SHIFT);
-	bound = zram->disksize >> SECTOR_SHIFT;
-	/* out of range range */
-	if (unlikely(start >= bound || end > bound || start > end))
-		return 0;
-
-	/* I/O request is valid */
-	return 1;
-}
-
-static void zram_meta_free(struct zram_meta *meta)
-{
-	zs_destroy_pool(meta->mem_pool);
-	kfree(meta->compress_workmem);
-	free_pages((unsigned long)meta->compress_buffer, 1);
-	vfree(meta->table);
-	kfree(meta);
-}
-
-static struct zram_meta *zram_meta_alloc(u64 disksize)
-{
-	size_t num_pages;
-	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
-	if (!meta)
-		goto out;
-
-	meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
-	if (!meta->compress_workmem)
-		goto free_meta;
-
-	meta->compress_buffer =
-		(void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
-	if (!meta->compress_buffer) {
-		pr_err("Error allocating compressor buffer space\n");
-		goto free_workmem;
-	}
-
-	num_pages = disksize >> PAGE_SHIFT;
-	meta->table = vzalloc(num_pages * sizeof(*meta->table));
-	if (!meta->table) {
-		pr_err("Error allocating zram address table\n");
-		goto free_buffer;
-	}
-
-	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM |
-					__GFP_NOWARN);
-	if (!meta->mem_pool) {
-		pr_err("Error creating memory pool\n");
-		goto free_table;
-	}
-
-	return meta;
-
-free_table:
-	vfree(meta->table);
-free_buffer:
-	free_pages((unsigned long)meta->compress_buffer, 1);
-free_workmem:
-	kfree(meta->compress_workmem);
-free_meta:
-	kfree(meta);
-	meta = NULL;
-out:
-	return meta;
-}
-
-static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
-{
-	if (*offset + bvec->bv_len >= PAGE_SIZE)
-		(*index)++;
-	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
-}
-
-static int page_zero_filled(void *ptr)
-{
-	unsigned int pos;
-	unsigned long *page;
-
-	page = (unsigned long *)ptr;
-
-	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
-		if (page[pos])
-			return 0;
-	}
-
-	return 1;
-}
-
-static void handle_zero_page(struct bio_vec *bvec)
-{
-	struct page *page = bvec->bv_page;
-	void *user_mem;
-
-	user_mem = kmap_atomic(page);
-	if (is_partial_io(bvec))
-		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
-	else
-		clear_page(user_mem);
-	kunmap_atomic(user_mem);
-
-	flush_dcache_page(page);
-}
-
-static void zram_free_page(struct zram *zram, size_t index)
-{
-	struct zram_meta *meta = zram->meta;
-	unsigned long handle = meta->table[index].handle;
-	u16 size = meta->table[index].size;
-
-	if (unlikely(!handle)) {
-		/*
-		 * No memory is allocated for zero filled pages.
-		 * Simply clear zero page flag.
-		 */
-		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
-			zram_clear_flag(meta, index, ZRAM_ZERO);
-			zram->stats.pages_zero--;
-		}
-		return;
-	}
-
-	if (unlikely(size > max_zpage_size))
-		zram->stats.bad_compress--;
-
-	zs_free(meta->mem_pool, handle);
-
-	if (size <= PAGE_SIZE / 2)
-		zram->stats.good_compress--;
-
-	atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
-	zram->stats.pages_stored--;
-
-	meta->table[index].handle = 0;
-	meta->table[index].size = 0;
-}
-
-static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
-{
-	int ret = LZO_E_OK;
-	size_t clen = PAGE_SIZE;
-	unsigned char *cmem;
-	struct zram_meta *meta = zram->meta;
-	unsigned long handle = meta->table[index].handle;
-
-	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
-		clear_page(mem);
-		return 0;
-	}
-
-	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
-	if (meta->table[index].size == PAGE_SIZE)
-		copy_page(mem, cmem);
-	else
-		ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
-						mem, &clen);
-	zs_unmap_object(meta->mem_pool, handle);
-
-	/* Should NEVER happen. Return bio error if it does. */
-	if (unlikely(ret != LZO_E_OK)) {
-		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-		atomic64_inc(&zram->stats.failed_reads);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
-			  u32 index, int offset, struct bio *bio)
-{
-	int ret;
-	struct page *page;
-	unsigned char *user_mem, *uncmem = NULL;
-	struct zram_meta *meta = zram->meta;
-	page = bvec->bv_page;
-
-	if (unlikely(!meta->table[index].handle) ||
-			zram_test_flag(meta, index, ZRAM_ZERO)) {
-		handle_zero_page(bvec);
-		return 0;
-	}
-
-	if (is_partial_io(bvec))
-		/* Use  a temporary buffer to decompress the page */
-		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-
-	user_mem = kmap_atomic(page);
-	if (!is_partial_io(bvec))
-		uncmem = user_mem;
-
-	if (!uncmem) {
-		pr_info("Unable to allocate temp memory\n");
-		ret = -ENOMEM;
-		goto out_cleanup;
-	}
-
-	ret = zram_decompress_page(zram, uncmem, index);
-	/* Should NEVER happen. Return bio error if it does. */
-	if (unlikely(ret != LZO_E_OK))
-		goto out_cleanup;
-
-	if (is_partial_io(bvec))
-		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
-				bvec->bv_len);
-
-	flush_dcache_page(page);
-	ret = 0;
-out_cleanup:
-	kunmap_atomic(user_mem);
-	if (is_partial_io(bvec))
-		kfree(uncmem);
-	return ret;
-}
-
-static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
-			   int offset)
-{
-	int ret = 0;
-	size_t clen;
-	unsigned long handle;
-	struct page *page;
-	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
-	struct zram_meta *meta = zram->meta;
-	static unsigned long zram_rs_time;
-
-	page = bvec->bv_page;
-	src = meta->compress_buffer;
-
-	if (is_partial_io(bvec)) {
-		/*
-		 * This is a partial IO. We need to read the full page
-		 * before to write the changes.
-		 */
-		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-		if (!uncmem) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		ret = zram_decompress_page(zram, uncmem, index);
-		if (ret)
-			goto out;
-	}
-
-	user_mem = kmap_atomic(page);
-
-	if (is_partial_io(bvec)) {
-		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
-		       bvec->bv_len);
-		kunmap_atomic(user_mem);
-		user_mem = NULL;
-	} else {
-		uncmem = user_mem;
-	}
-
-	if (page_zero_filled(uncmem)) {
-		kunmap_atomic(user_mem);
-		/* Free memory associated with this sector now. */
-		zram_free_page(zram, index);
-
-		zram->stats.pages_zero++;
-		zram_set_flag(meta, index, ZRAM_ZERO);
-		ret = 0;
-		goto out;
-	}
-
-	/*
-	 * zram_slot_free_notify could miss free so that let's
-	 * double check.
-	 */
-	if (unlikely(meta->table[index].handle ||
-			zram_test_flag(meta, index, ZRAM_ZERO)))
-		zram_free_page(zram, index);
-
-	ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
-			       meta->compress_workmem);
-
-	if (!is_partial_io(bvec)) {
-		kunmap_atomic(user_mem);
-		user_mem = NULL;
-		uncmem = NULL;
-	}
-
-	if (unlikely(ret != LZO_E_OK)) {
-		pr_err("Compression failed! err=%d\n", ret);
-		goto out;
-	}
-
-	if (unlikely(clen > max_zpage_size)) {
-		zram->stats.bad_compress++;
-		clen = PAGE_SIZE;
-		src = NULL;
-		if (is_partial_io(bvec))
-			src = uncmem;
-	}
-
-	handle = zs_malloc(meta->mem_pool, clen);
-	if (!handle) {
-		if (printk_timed_ratelimit(&zram_rs_time,
-					   ALLOC_ERROR_LOG_RATE_MS))
-			pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
-				index, clen);
-		ret = -ENOMEM;
-		goto out;
-	}
-	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
-
-	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
-		src = kmap_atomic(page);
-		copy_page(cmem, src);
-		kunmap_atomic(src);
-	} else {
-		memcpy(cmem, src, clen);
-	}
-
-	zs_unmap_object(meta->mem_pool, handle);
-
-	/*
-	 * Free memory associated with this sector
-	 * before overwriting unused sectors.
-	 */
-	zram_free_page(zram, index);
-
-	meta->table[index].handle = handle;
-	meta->table[index].size = clen;
-
-	/* Update stats */
-	atomic64_add(clen, &zram->stats.compr_size);
-	zram->stats.pages_stored++;
-	if (clen <= PAGE_SIZE / 2)
-		zram->stats.good_compress++;
-
-out:
-	if (is_partial_io(bvec))
-		kfree(uncmem);
-
-	if (ret)
-		atomic64_inc(&zram->stats.failed_writes);
-	return ret;
-}
-
-static void handle_pending_slot_free(struct zram *zram)
-{
-	struct zram_slot_free *free_rq;
-
-	spin_lock(&zram->slot_free_lock);
-	while (zram->slot_free_rq) {
-		free_rq = zram->slot_free_rq;
-		zram->slot_free_rq = free_rq->next;
-		zram_free_page(zram, free_rq->index);
-		kfree(free_rq);
-	}
-	spin_unlock(&zram->slot_free_lock);
-}
-
-static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-			int offset, struct bio *bio, int rw)
-{
-	int ret;
-
-	if (rw == READ) {
-		down_read(&zram->lock);
-		handle_pending_slot_free(zram);
-		ret = zram_bvec_read(zram, bvec, index, offset, bio);
-		up_read(&zram->lock);
-	} else {
-		down_write(&zram->lock);
-		handle_pending_slot_free(zram);
-		ret = zram_bvec_write(zram, bvec, index, offset);
-		up_write(&zram->lock);
-	}
-
-	return ret;
-}
-
-static void zram_reset_device(struct zram *zram, bool reset_capacity)
-{
-	size_t index;
-	struct zram_meta *meta;
-
-	flush_work(&zram->free_work);
-
-	down_write(&zram->init_lock);
-	if (!zram->init_done) {
-		up_write(&zram->init_lock);
-		return;
-	}
-
-	meta = zram->meta;
-	zram->init_done = 0;
-
-	/* Free all pages that are still in this zram device */
-	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
-		unsigned long handle = meta->table[index].handle;
-		if (!handle)
-			continue;
-
-		zs_free(meta->mem_pool, handle);
-	}
-
-	zram_meta_free(zram->meta);
-	zram->meta = NULL;
-	/* Reset stats */
-	memset(&zram->stats, 0, sizeof(zram->stats));
-
-	zram->disksize = 0;
-	if (reset_capacity)
-		set_capacity(zram->disk, 0);
-	up_write(&zram->init_lock);
-}
-
-static void zram_init_device(struct zram *zram, struct zram_meta *meta)
-{
-	if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) {
-		pr_info(
-		"There is little point creating a zram of greater than "
-		"twice the size of memory since we expect a 2:1 compression "
-		"ratio. Note that zram uses about 0.1%% of the size of "
-		"the disk when not in use so a huge zram is "
-		"wasteful.\n"
-		"\tMemory Size: %lu kB\n"
-		"\tSize you selected: %llu kB\n"
-		"Continuing anyway ...\n",
-		(totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10
-		);
-	}
-
-	/* zram devices sort of resembles non-rotational disks */
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
-
-	zram->meta = meta;
-	zram->init_done = 1;
-
-	pr_debug("Initialization done!\n");
-}
-
-static ssize_t disksize_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	u64 disksize;
-	struct zram_meta *meta;
-	struct zram *zram = dev_to_zram(dev);
-
-	disksize = memparse(buf, NULL);
-	if (!disksize)
-		return -EINVAL;
-
-	disksize = PAGE_ALIGN(disksize);
-	meta = zram_meta_alloc(disksize);
-	down_write(&zram->init_lock);
-	if (zram->init_done) {
-		up_write(&zram->init_lock);
-		zram_meta_free(meta);
-		pr_info("Cannot change disksize for initialized device\n");
-		return -EBUSY;
-	}
-
-	zram->disksize = disksize;
-	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-	zram_init_device(zram, meta);
-	up_write(&zram->init_lock);
-
-	return len;
-}
-
-static ssize_t reset_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	int ret;
-	unsigned short do_reset;
-	struct zram *zram;
-	struct block_device *bdev;
-
-	zram = dev_to_zram(dev);
-	bdev = bdget_disk(zram->disk, 0);
-
-	/* Do not reset an active device! */
-	if (bdev->bd_holders)
-		return -EBUSY;
-
-	ret = kstrtou16(buf, 10, &do_reset);
-	if (ret)
-		return ret;
-
-	if (!do_reset)
-		return -EINVAL;
-
-	/* Make sure all pending I/O is finished */
-	if (bdev)
-		fsync_bdev(bdev);
-
-	zram_reset_device(zram, true);
-	return len;
-}
-
-static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
-{
-	int i, offset;
-	u32 index;
-	struct bio_vec *bvec;
-
-	switch (rw) {
-	case READ:
-		atomic64_inc(&zram->stats.num_reads);
-		break;
-	case WRITE:
-		atomic64_inc(&zram->stats.num_writes);
-		break;
-	}
-
-	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
-	offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
-
-	bio_for_each_segment(bvec, bio, i) {
-		int max_transfer_size = PAGE_SIZE - offset;
-
-		if (bvec->bv_len > max_transfer_size) {
-			/*
-			 * zram_bvec_rw() can only make operation on a single
-			 * zram page. Split the bio vector.
-			 */
-			struct bio_vec bv;
-
-			bv.bv_page = bvec->bv_page;
-			bv.bv_len = max_transfer_size;
-			bv.bv_offset = bvec->bv_offset;
-
-			if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0)
-				goto out;
-
-			bv.bv_len = bvec->bv_len - max_transfer_size;
-			bv.bv_offset += max_transfer_size;
-			if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0)
-				goto out;
-		} else
-			if (zram_bvec_rw(zram, bvec, index, offset, bio, rw)
-			    < 0)
-				goto out;
-
-		update_position(&index, &offset, bvec);
-	}
-
-	set_bit(BIO_UPTODATE, &bio->bi_flags);
-	bio_endio(bio, 0);
-	return;
-
-out:
-	bio_io_error(bio);
-}
-
-/*
- * Handler function for all zram I/O requests.
- */
-static void zram_make_request(struct request_queue *queue, struct bio *bio)
-{
-	struct zram *zram = queue->queuedata;
-
-	down_read(&zram->init_lock);
-	if (unlikely(!zram->init_done))
-		goto error;
-
-	if (!valid_io_request(zram, bio)) {
-		atomic64_inc(&zram->stats.invalid_io);
-		goto error;
-	}
-
-	__zram_make_request(zram, bio, bio_data_dir(bio));
-	up_read(&zram->init_lock);
-
-	return;
-
-error:
-	up_read(&zram->init_lock);
-	bio_io_error(bio);
-}
-
-static void zram_slot_free(struct work_struct *work)
-{
-	struct zram *zram;
-
-	zram = container_of(work, struct zram, free_work);
-	down_write(&zram->lock);
-	handle_pending_slot_free(zram);
-	up_write(&zram->lock);
-}
-
-static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
-{
-	spin_lock(&zram->slot_free_lock);
-	free_rq->next = zram->slot_free_rq;
-	zram->slot_free_rq = free_rq;
-	spin_unlock(&zram->slot_free_lock);
-}
-
-static void zram_slot_free_notify(struct block_device *bdev,
-				unsigned long index)
-{
-	struct zram *zram;
-	struct zram_slot_free *free_rq;
-
-	zram = bdev->bd_disk->private_data;
-	atomic64_inc(&zram->stats.notify_free);
-
-	free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
-	if (!free_rq)
-		return;
-
-	free_rq->index = index;
-	add_slot_free(zram, free_rq);
-	schedule_work(&zram->free_work);
-}
-
-static const struct block_device_operations zram_devops = {
-	.swap_slot_free_notify = zram_slot_free_notify,
-	.owner = THIS_MODULE
-};
-
-static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
-		disksize_show, disksize_store);
-static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
-static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
-static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
-static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
-static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
-static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
-static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
-static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
-static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
-
-static struct attribute *zram_disk_attrs[] = {
-	&dev_attr_disksize.attr,
-	&dev_attr_initstate.attr,
-	&dev_attr_reset.attr,
-	&dev_attr_num_reads.attr,
-	&dev_attr_num_writes.attr,
-	&dev_attr_invalid_io.attr,
-	&dev_attr_notify_free.attr,
-	&dev_attr_zero_pages.attr,
-	&dev_attr_orig_data_size.attr,
-	&dev_attr_compr_data_size.attr,
-	&dev_attr_mem_used_total.attr,
-	NULL,
-};
-
-static struct attribute_group zram_disk_attr_group = {
-	.attrs = zram_disk_attrs,
-};
-
-static int create_device(struct zram *zram, int device_id)
-{
-	int ret = -ENOMEM;
-
-	init_rwsem(&zram->lock);
-	init_rwsem(&zram->init_lock);
-
-	INIT_WORK(&zram->free_work, zram_slot_free);
-	spin_lock_init(&zram->slot_free_lock);
-	zram->slot_free_rq = NULL;
-
-	zram->queue = blk_alloc_queue(GFP_KERNEL);
-	if (!zram->queue) {
-		pr_err("Error allocating disk queue for device %d\n",
-			device_id);
-		goto out;
-	}
-
-	blk_queue_make_request(zram->queue, zram_make_request);
-	zram->queue->queuedata = zram;
-
-	 /* gendisk structure */
-	zram->disk = alloc_disk(1);
-	if (!zram->disk) {
-		pr_warn("Error allocating disk structure for device %d\n",
-			device_id);
-		goto out_free_queue;
-	}
-
-	zram->disk->major = zram_major;
-	zram->disk->first_minor = device_id;
-	zram->disk->fops = &zram_devops;
-	zram->disk->queue = zram->queue;
-	zram->disk->private_data = zram;
-	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
-
-	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
-	set_capacity(zram->disk, 0);
-
-	/*
-	 * To ensure that we always get PAGE_SIZE aligned
-	 * and n*PAGE_SIZED sized I/O requests.
-	 */
-	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
-	blk_queue_logical_block_size(zram->disk->queue,
-					ZRAM_LOGICAL_BLOCK_SIZE);
-	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
-	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
-
-	add_disk(zram->disk);
-
-	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
-				&zram_disk_attr_group);
-	if (ret < 0) {
-		pr_warn("Error creating sysfs group");
-		goto out_free_disk;
-	}
-
-	zram->init_done = 0;
-	return 0;
-
-out_free_disk:
-	del_gendisk(zram->disk);
-	put_disk(zram->disk);
-out_free_queue:
-	blk_cleanup_queue(zram->queue);
-out:
-	return ret;
-}
-
-static void destroy_device(struct zram *zram)
-{
-	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
-			&zram_disk_attr_group);
-
-	if (zram->disk) {
-		del_gendisk(zram->disk);
-		put_disk(zram->disk);
-	}
-
-	if (zram->queue)
-		blk_cleanup_queue(zram->queue);
-}
-
-static int __init zram_init(void)
-{
-	int ret, dev_id;
-
-	if (num_devices > max_num_devices) {
-		pr_warn("Invalid value for num_devices: %u\n",
-				num_devices);
-		ret = -EINVAL;
-		goto out;
-	}
-
-	zram_major = register_blkdev(0, "zram");
-	if (zram_major <= 0) {
-		pr_warn("Unable to get major number\n");
-		ret = -EBUSY;
-		goto out;
-	}
-
-	/* Allocate the device array and initialize each one */
-	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
-	if (!zram_devices) {
-		ret = -ENOMEM;
-		goto unregister;
-	}
-
-	for (dev_id = 0; dev_id < num_devices; dev_id++) {
-		ret = create_device(&zram_devices[dev_id], dev_id);
-		if (ret)
-			goto free_devices;
-	}
-
-	pr_info("Created %u device(s) ...\n", num_devices);
-
-	return 0;
-
-free_devices:
-	while (dev_id)
-		destroy_device(&zram_devices[--dev_id]);
-	kfree(zram_devices);
-unregister:
-	unregister_blkdev(zram_major, "zram");
-out:
-	return ret;
-}
-
-static void __exit zram_exit(void)
-{
-	int i;
-	struct zram *zram;
-
-	for (i = 0; i < num_devices; i++) {
-		zram = &zram_devices[i];
-
-		destroy_device(zram);
-		/*
-		 * Shouldn't access zram->disk after destroy_device
-		 * because destroy_device already released zram->disk.
-		 */
-		zram_reset_device(zram, false);
-	}
-
-	unregister_blkdev(zram_major, "zram");
-
-	kfree(zram_devices);
-	pr_debug("Cleanup done!\n");
-}
-
-module_init(zram_init);
-module_exit(zram_exit);
-
-module_param(num_devices, uint, 0);
-MODULE_PARM_DESC(num_devices, "Number of zram devices");
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
-MODULE_DESCRIPTION("Compressed RAM Block Device");
-MODULE_ALIAS("devname:zram");
diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c
deleted file mode 100644
index aafcf0330..000000000
--- a/drivers/staging/zram/zram_sysfs.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Compressed RAM block device
- *
- * Copyright (C) 2008, 2009, 2010  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the licence that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- *
- * Project home: http://compcache.googlecode.com/
- */
-
-#include <linux/device.h>
-#include <linux/genhd.h>
-#include <linux/mm.h>
-
-#include "zram_drv.h"
-
-static u64 zram_stat64_read(struct zram *zram, u64 *v)
-{
-	u64 val;
-
-	spin_lock(&zram->stat64_lock);
-	val = *v;
-	spin_unlock(&zram->stat64_lock);
-
-	return val;
-}
-
-static struct zram *dev_to_zram(struct device *dev)
-{
-	int i;
-	struct zram *zram = NULL;
-
-	for (i = 0; i < zram_get_num_devices(); i++) {
-		zram = &zram_devices[i];
-		if (disk_to_dev(zram->disk) == dev)
-			break;
-	}
-
-	return zram;
-}
-
-static ssize_t disksize_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n", zram->disksize);
-}
-
-static ssize_t disksize_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	int ret;
-	u64 disksize;
-	struct zram *zram = dev_to_zram(dev);
-
-	ret = kstrtoull(buf, 10, &disksize);
-	if (ret)
-		return ret;
-
-	down_write(&zram->init_lock);
-	if (zram->init_done) {
-		up_write(&zram->init_lock);
-		pr_info("Cannot change disksize for initialized device\n");
-		return -EBUSY;
-	}
-
-	zram->disksize = PAGE_ALIGN(disksize);
-	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-	up_write(&zram->init_lock);
-
-	return len;
-}
-
-static ssize_t initstate_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->init_done);
-}
-
-static ssize_t reset_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t len)
-{
-	int ret;
-	unsigned short do_reset;
-	struct zram *zram;
-	struct block_device *bdev;
-
-	zram = dev_to_zram(dev);
-	bdev = bdget_disk(zram->disk, 0);
-
-	if (!bdev)
-		return -ENOMEM;
-
-	/* Do not reset an active device! */
-	if (bdev->bd_holders) {
-		ret = -EBUSY;
-		goto out;
-	}
-
-	ret = kstrtou16(buf, 10, &do_reset);
-	if (ret)
-		goto out;
-
-	if (!do_reset) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* Make sure all pending I/O is finished */
-	fsync_bdev(bdev);
-	bdput(bdev);
-
-	down_write(&zram->init_lock);
-	if (zram->init_done)
-		__zram_reset_device(zram);
-	up_write(&zram->init_lock);
-
-	return len;
-
-out:
-	bdput(bdev);
-	return ret;
-}
-
-static ssize_t num_reads_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.num_reads));
-}
-
-static ssize_t num_writes_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.num_writes));
-}
-
-static ssize_t invalid_io_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.invalid_io));
-}
-
-static ssize_t notify_free_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.notify_free));
-}
-
-static ssize_t zero_pages_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%u\n", zram->stats.pages_zero);
-}
-
-static ssize_t orig_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
-}
-
-static ssize_t compr_data_size_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct zram *zram = dev_to_zram(dev);
-
-	return sprintf(buf, "%llu\n",
-		zram_stat64_read(zram, &zram->stats.compr_size));
-}
-
-static ssize_t mem_used_total_show(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	u64 val = 0;
-	struct zram *zram = dev_to_zram(dev);
-
-	down_read(&zram->init_lock);
-	if (zram->init_done) {
-		val = zs_get_total_size_bytes(zram->mem_pool) +
-			((u64)(zram->stats.pages_expand) << PAGE_SHIFT);
-	}
-	up_read(&zram->init_lock);
-
-	return sprintf(buf, "%llu\n", val);
-}
-
-static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
-		disksize_show, disksize_store);
-static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
-static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
-static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
-static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
-static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
-static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
-static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
-static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
-static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
-static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
-
-static struct attribute *zram_disk_attrs[] = {
-	&dev_attr_disksize.attr,
-	&dev_attr_initstate.attr,
-	&dev_attr_reset.attr,
-	&dev_attr_num_reads.attr,
-	&dev_attr_num_writes.attr,
-	&dev_attr_invalid_io.attr,
-	&dev_attr_notify_free.attr,
-	&dev_attr_zero_pages.attr,
-	&dev_attr_orig_data_size.attr,
-	&dev_attr_compr_data_size.attr,
-	&dev_attr_mem_used_total.attr,
-	NULL,
-};
-
-struct attribute_group zram_disk_attr_group = {
-	.attrs = zram_disk_attrs,
-};
diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig
deleted file mode 100644
index 7fab03229..000000000
--- a/drivers/staging/zsmalloc/Kconfig
+++ /dev/null
@@ -1,10 +0,0 @@
-config ZSMALLOC
-	bool "Memory allocator for compressed pages"
-	default n
-	help
-	  zsmalloc is a slab-based memory allocator designed to store
-	  compressed RAM pages.  zsmalloc uses virtual memory mapping
-	  in order to reduce fragmentation.  However, this results in a
-	  non-standard allocator interface where a handle, not a pointer, is
-	  returned by an alloc().  This handle must be mapped in order to
-	  access the allocated space.
diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile
deleted file mode 100644
index b134848a5..000000000
--- a/drivers/staging/zsmalloc/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-zsmalloc-y 		:= zsmalloc-main.o
-
-obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
deleted file mode 100644
index 41a68032c..000000000
--- a/drivers/staging/zsmalloc/zsmalloc-main.c
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- * zsmalloc memory allocator
- *
- * Copyright (C) 2011  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the license that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-
-/*
- * This allocator is designed for use with zcache and zram. Thus, the
- * allocator is supposed to work well under low memory conditions. In
- * particular, it never attempts higher order page allocation which is
- * very likely to fail under memory pressure. On the other hand, if we
- * just use single (0-order) pages, it would suffer from very high
- * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy
- * an entire page. This was one of the major issues with its predecessor
- * (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * Following is how we use various fields and flags of underlying
- * struct page(s) to form a zspage.
- *
- * Usage of struct page fields:
- *	page->first_page: points to the first component (0-order) page
- *	page->index (union with page->freelist): offset of the first object
- *		starting in this page. For the first page, this is
- *		always 0, so we use this field (aka freelist) to point
- *		to the first free object in zspage.
- *	page->lru: links together all component pages (except the first page)
- *		of a zspage
- *
- *	For _first_ page only:
- *
- *	page->private (union with page->first_page): refers to the
- *		component page after the first page
- *	page->freelist: points to the first free object in zspage.
- *		Free objects are linked together using in-place
- *		metadata.
- *	page->objects: maximum number of objects we can store in this
- *		zspage (class->zspage_order * PAGE_SIZE / class->size)
- *	page->lru: links together first pages of various zspages.
- *		Basically forming list of zspages in a fullness group.
- *	page->mapping: class index and fullness group of the zspage
- *
- * Usage of struct page flags:
- *	PG_private: identifies the first component page
- *	PG_private2: identifies the last component page
- *
- */
-
-#ifdef CONFIG_ZSMALLOC_DEBUG
-#define DEBUG
-#endif
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/bitops.h>
-#include <linux/errno.h>
-#include <linux/highmem.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
-#include <linux/cpumask.h>
-#include <linux/cpu.h>
-#include <linux/vmalloc.h>
-#include <linux/hardirq.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-
-#include "zsmalloc.h"
-
-/*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
- * These two conditions ensure that any 'struct link_free' itself doesn't
- * span more than 1 page which avoids complex case of mapping 2 pages simply
- * to restore link_free pointer values.
- */
-#define ZS_ALIGN		8
-
-/*
- * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
- * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
- */
-#define ZS_MAX_ZSPAGE_ORDER 2
-#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
-/*
- * Object location (<PFN>, <obj_idx>) is encoded as
- * as single (void *) handle value.
- *
- * Note that object index <obj_idx> is relative to system
- * page <PFN> it is stored in, so for each sub-page belonging
- * to a zspage, obj_idx starts with 0.
- *
- * This is made more complicated by various memory models and PAE.
- */
-
-#ifndef MAX_PHYSMEM_BITS
-#ifdef CONFIG_HIGHMEM64G
-#define MAX_PHYSMEM_BITS 36
-#else /* !CONFIG_HIGHMEM64G */
-/*
- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
- * be PAGE_SHIFT
- */
-#define MAX_PHYSMEM_BITS BITS_PER_LONG
-#endif
-#endif
-#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
-#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
-
-#define MAX(a, b) ((a) >= (b) ? (a) : (b))
-/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
-#define ZS_MIN_ALLOC_SIZE \
-	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
-#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
-
-/*
- * On systems with 4K page size, this gives 254 size classes! There is a
- * trader-off here:
- *  - Large number of size classes is potentially wasteful as free page are
- *    spread across these classes
- *  - Small number of size classes causes large internal fragmentation
- *  - Probably its better to use specific size classes (empirically
- *    determined). NOTE: all those class sizes must be set as multiple of
- *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
- *
- *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
- *  (reason above)
- */
-#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES		((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
-					ZS_SIZE_CLASS_DELTA + 1)
-
-/*
- * We do not maintain any list for completely empty or full pages
- */
-enum fullness_group {
-	ZS_ALMOST_FULL,
-	ZS_ALMOST_EMPTY,
-	_ZS_NR_FULLNESS_GROUPS,
-
-	ZS_EMPTY,
-	ZS_FULL
-};
-
-/*
- * We assign a page to ZS_ALMOST_EMPTY fullness group when:
- *	n <= N / f, where
- * n = number of allocated objects
- * N = total number of objects zspage can store
- * f = 1/fullness_threshold_frac
- *
- * Similarly, we assign zspage to:
- *	ZS_ALMOST_FULL	when n > N / f
- *	ZS_EMPTY	when n == 0
- *	ZS_FULL		when n == N
- *
- * (see: fix_fullness_group())
- */
-static const int fullness_threshold_frac = 4;
-
-struct size_class {
-	/*
-	 * Size of objects stored in this class. Must be multiple
-	 * of ZS_ALIGN.
-	 */
-	int size;
-	unsigned int index;
-
-	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
-	int pages_per_zspage;
-
-	spinlock_t lock;
-
-	/* stats */
-	u64 pages_allocated;
-
-	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
-};
-
-/*
- * Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
- *
- * This must be power of 2 and less than or equal to ZS_ALIGN
- */
-struct link_free {
-	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
-	void *next;
-};
-
-struct zs_pool {
-	struct size_class size_class[ZS_SIZE_CLASSES];
-
-	gfp_t flags;	/* allocation flags used when growing pool */
-};
-
-/*
- * A zspage's class index and fullness group
- * are encoded in its (first)page->mapping
- */
-#define CLASS_IDX_BITS	28
-#define FULLNESS_BITS	4
-#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
-#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)
-
-/*
- * By default, zsmalloc uses a copy-based object mapping method to access
- * allocations that span two pages. However, if a particular architecture
- * performs VM mapping faster than copying, then it should be added here
- * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use
- * page table mapping rather than copying for object mapping.
- */
-#if defined(CONFIG_ARM) && !defined(MODULE)
-#define USE_PGTABLE_MAPPING
-#endif
-
-struct mapping_area {
-#ifdef USE_PGTABLE_MAPPING
-	struct vm_struct *vm; /* vm area for mapping object that span pages */
-#else
-	char *vm_buf; /* copy buffer for objects that span pages */
-#endif
-	char *vm_addr; /* address of kmap_atomic()'ed pages */
-	enum zs_mapmode vm_mm; /* mapping mode */
-};
-
-
-/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
-
-static int is_first_page(struct page *page)
-{
-	return PagePrivate(page);
-}
-
-static int is_last_page(struct page *page)
-{
-	return PagePrivate2(page);
-}
-
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
-				enum fullness_group *fullness)
-{
-	unsigned long m;
-	BUG_ON(!is_first_page(page));
-
-	m = (unsigned long)page->mapping;
-	*fullness = m & FULLNESS_MASK;
-	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
-}
-
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
-				enum fullness_group fullness)
-{
-	unsigned long m;
-	BUG_ON(!is_first_page(page));
-
-	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
-			(fullness & FULLNESS_MASK);
-	page->mapping = (struct address_space *)m;
-}
-
-static int get_size_class_index(int size)
-{
-	int idx = 0;
-
-	if (likely(size > ZS_MIN_ALLOC_SIZE))
-		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
-				ZS_SIZE_CLASS_DELTA);
-
-	return idx;
-}
-
-static enum fullness_group get_fullness_group(struct page *page)
-{
-	int inuse, max_objects;
-	enum fullness_group fg;
-	BUG_ON(!is_first_page(page));
-
-	inuse = page->inuse;
-	max_objects = page->objects;
-
-	if (inuse == 0)
-		fg = ZS_EMPTY;
-	else if (inuse == max_objects)
-		fg = ZS_FULL;
-	else if (inuse <= max_objects / fullness_threshold_frac)
-		fg = ZS_ALMOST_EMPTY;
-	else
-		fg = ZS_ALMOST_FULL;
-
-	return fg;
-}
-
-static void insert_zspage(struct page *page, struct size_class *class,
-				enum fullness_group fullness)
-{
-	struct page **head;
-
-	BUG_ON(!is_first_page(page));
-
-	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
-		return;
-
-	head = &class->fullness_list[fullness];
-	if (*head)
-		list_add_tail(&page->lru, &(*head)->lru);
-
-	*head = page;
-}
-
-static void remove_zspage(struct page *page, struct size_class *class,
-				enum fullness_group fullness)
-{
-	struct page **head;
-
-	BUG_ON(!is_first_page(page));
-
-	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
-		return;
-
-	head = &class->fullness_list[fullness];
-	BUG_ON(!*head);
-	if (list_empty(&(*head)->lru))
-		*head = NULL;
-	else if (*head == page)
-		*head = (struct page *)list_entry((*head)->lru.next,
-					struct page, lru);
-
-	list_del_init(&page->lru);
-}
-
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
-						struct page *page)
-{
-	int class_idx;
-	struct size_class *class;
-	enum fullness_group currfg, newfg;
-
-	BUG_ON(!is_first_page(page));
-
-	get_zspage_mapping(page, &class_idx, &currfg);
-	newfg = get_fullness_group(page);
-	if (newfg == currfg)
-		goto out;
-
-	class = &pool->size_class[class_idx];
-	remove_zspage(page, class, currfg);
-	insert_zspage(page, class, newfg);
-	set_zspage_mapping(page, class_idx, newfg);
-
-out:
-	return newfg;
-}
-
-/*
- * We have to decide on how many pages to link together
- * to form a zspage for each size class. This is important
- * to reduce wastage due to unusable space left at end of
- * each zspage which is given as:
- *	wastage = Zp - Zp % size_class
- * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
- *
- * For example, for size class of 3/8 * PAGE_SIZE, we should
- * link together 3 PAGE_SIZE sized pages to form a zspage
- * since then we can perfectly fit in 8 such objects.
- */
-static int get_pages_per_zspage(int class_size)
-{
-	int i, max_usedpc = 0;
-	/* zspage order which gives maximum used size per KB */
-	int max_usedpc_order = 1;
-
-	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
-		int zspage_size;
-		int waste, usedpc;
-
-		zspage_size = i * PAGE_SIZE;
-		waste = zspage_size % class_size;
-		usedpc = (zspage_size - waste) * 100 / zspage_size;
-
-		if (usedpc > max_usedpc) {
-			max_usedpc = usedpc;
-			max_usedpc_order = i;
-		}
-	}
-
-	return max_usedpc_order;
-}
-
-/*
- * A single 'zspage' is composed of many system pages which are
- * linked together using fields in struct page. This function finds
- * the first/head page, given any component page of a zspage.
- */
-static struct page *get_first_page(struct page *page)
-{
-	if (is_first_page(page))
-		return page;
-	else
-		return page->first_page;
-}
-
-static struct page *get_next_page(struct page *page)
-{
-	struct page *next;
-
-	if (is_last_page(page))
-		next = NULL;
-	else if (is_first_page(page))
-		next = (struct page *)page_private(page);
-	else
-		next = list_entry(page->lru.next, struct page, lru);
-
-	return next;
-}
-
-/*
- * Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
- */
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
-{
-	unsigned long handle;
-
-	if (!page) {
-		BUG_ON(obj_idx);
-		return NULL;
-	}
-
-	handle = page_to_pfn(page) << OBJ_INDEX_BITS;
-	handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
-
-	return (void *)handle;
-}
-
-/*
- * Decode <page, obj_idx> pair from the given object handle. We adjust the
- * decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
- */
-static void obj_handle_to_location(unsigned long handle, struct page **page,
-				unsigned long *obj_idx)
-{
-	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
-	*obj_idx = (handle & OBJ_INDEX_MASK) - 1;
-}
-
-static unsigned long obj_idx_to_offset(struct page *page,
-				unsigned long obj_idx, int class_size)
-{
-	unsigned long off = 0;
-
-	if (!is_first_page(page))
-		off = page->index;
-
-	return off + obj_idx * class_size;
-}
-
-static void reset_page(struct page *page)
-{
-	clear_bit(PG_private, &page->flags);
-	clear_bit(PG_private_2, &page->flags);
-	set_page_private(page, 0);
-	page->mapping = NULL;
-	page->freelist = NULL;
-	reset_page_mapcount(page);
-}
-
-static void free_zspage(struct page *first_page)
-{
-	struct page *nextp, *tmp, *head_extra;
-
-	BUG_ON(!is_first_page(first_page));
-	BUG_ON(first_page->inuse);
-
-	head_extra = (struct page *)page_private(first_page);
-
-	reset_page(first_page);
-	__free_page(first_page);
-
-	/* zspage with only 1 system page */
-	if (!head_extra)
-		return;
-
-	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
-		list_del(&nextp->lru);
-		reset_page(nextp);
-		__free_page(nextp);
-	}
-	reset_page(head_extra);
-	__free_page(head_extra);
-}
-
-/* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
-{
-	unsigned long off = 0;
-	struct page *page = first_page;
-
-	BUG_ON(!is_first_page(first_page));
-	while (page) {
-		struct page *next_page;
-		struct link_free *link;
-		unsigned int i, objs_on_page;
-
-		/*
-		 * page->index stores offset of first object starting
-		 * in the page. For the first page, this is always 0,
-		 * so we use first_page->index (aka ->freelist) to store
-		 * head of corresponding zspage's freelist.
-		 */
-		if (page != first_page)
-			page->index = off;
-
-		link = (struct link_free *)kmap_atomic(page) +
-						off / sizeof(*link);
-		objs_on_page = (PAGE_SIZE - off) / class->size;
-
-		for (i = 1; i <= objs_on_page; i++) {
-			off += class->size;
-			if (off < PAGE_SIZE) {
-				link->next = obj_location_to_handle(page, i);
-				link += class->size / sizeof(*link);
-			}
-		}
-
-		/*
-		 * We now come to the last (full or partial) object on this
-		 * page, which must point to the first object on the next
-		 * page (if present)
-		 */
-		next_page = get_next_page(page);
-		link->next = obj_location_to_handle(next_page, 0);
-		kunmap_atomic(link);
-		page = next_page;
-		off = (off + class->size) % PAGE_SIZE;
-	}
-}
-
-/*
- * Allocate a zspage for the given size class
- */
-static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
-{
-	int i, error;
-	struct page *first_page = NULL, *uninitialized_var(prev_page);
-
-	/*
-	 * Allocate individual pages and link them together as:
-	 * 1. first page->private = first sub-page
-	 * 2. all sub-pages are linked together using page->lru
-	 * 3. each sub-page is linked to the first page using page->first_page
-	 *
-	 * For each size class, First/Head pages are linked together using
-	 * page->lru. Also, we set PG_private to identify the first page
-	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
-	 * identify the last page.
-	 */
-	error = -ENOMEM;
-	for (i = 0; i < class->pages_per_zspage; i++) {
-		struct page *page;
-
-		page = alloc_page(flags);
-		if (!page)
-			goto cleanup;
-
-		INIT_LIST_HEAD(&page->lru);
-		if (i == 0) {	/* first page */
-			SetPagePrivate(page);
-			set_page_private(page, 0);
-			first_page = page;
-			first_page->inuse = 0;
-		}
-		if (i == 1)
-			set_page_private(first_page, (unsigned long)page);
-		if (i >= 1)
-			page->first_page = first_page;
-		if (i >= 2)
-			list_add(&page->lru, &prev_page->lru);
-		if (i == class->pages_per_zspage - 1)	/* last page */
-			SetPagePrivate2(page);
-		prev_page = page;
-	}
-
-	init_zspage(first_page, class);
-
-	first_page->freelist = obj_location_to_handle(first_page, 0);
-	/* Maximum number of objects we can store in this zspage */
-	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
-
-	error = 0; /* Success */
-
-cleanup:
-	if (unlikely(error) && first_page) {
-		free_zspage(first_page);
-		first_page = NULL;
-	}
-
-	return first_page;
-}
-
-static struct page *find_get_zspage(struct size_class *class)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
-		page = class->fullness_list[i];
-		if (page)
-			break;
-	}
-
-	return page;
-}
-
-#ifdef USE_PGTABLE_MAPPING
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
-	/*
-	 * Make sure we don't leak memory if a cpu UP notification
-	 * and zs_init() race and both call zs_cpu_up() on the same cpu
-	 */
-	if (area->vm)
-		return 0;
-	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
-	if (!area->vm)
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
-	if (area->vm)
-		free_vm_area(area->vm);
-	area->vm = NULL;
-}
-
-static inline void *__zs_map_object(struct mapping_area *area,
-				struct page *pages[2], int off, int size)
-{
-	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages));
-	area->vm_addr = area->vm->addr;
-	return area->vm_addr + off;
-}
-
-static inline void __zs_unmap_object(struct mapping_area *area,
-				struct page *pages[2], int off, int size)
-{
-	unsigned long addr = (unsigned long)area->vm_addr;
-
-	unmap_kernel_range(addr, PAGE_SIZE * 2);
-}
-
-#else /* USE_PGTABLE_MAPPING */
-
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
-	/*
-	 * Make sure we don't leak memory if a cpu UP notification
-	 * and zs_init() race and both call zs_cpu_up() on the same cpu
-	 */
-	if (area->vm_buf)
-		return 0;
-	area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
-	if (!area->vm_buf)
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
-	if (area->vm_buf)
-		free_page((unsigned long)area->vm_buf);
-	area->vm_buf = NULL;
-}
-
-static void *__zs_map_object(struct mapping_area *area,
-			struct page *pages[2], int off, int size)
-{
-	int sizes[2];
-	void *addr;
-	char *buf = area->vm_buf;
-
-	/* disable page faults to match kmap_atomic() return conditions */
-	pagefault_disable();
-
-	/* no read fastpath */
-	if (area->vm_mm == ZS_MM_WO)
-		goto out;
-
-	sizes[0] = PAGE_SIZE - off;
-	sizes[1] = size - sizes[0];
-
-	/* copy object to per-cpu buffer */
-	addr = kmap_atomic(pages[0]);
-	memcpy(buf, addr + off, sizes[0]);
-	kunmap_atomic(addr);
-	addr = kmap_atomic(pages[1]);
-	memcpy(buf + sizes[0], addr, sizes[1]);
-	kunmap_atomic(addr);
-out:
-	return area->vm_buf;
-}
-
-static void __zs_unmap_object(struct mapping_area *area,
-			struct page *pages[2], int off, int size)
-{
-	int sizes[2];
-	void *addr;
-	char *buf = area->vm_buf;
-
-	/* no write fastpath */
-	if (area->vm_mm == ZS_MM_RO)
-		goto out;
-
-	sizes[0] = PAGE_SIZE - off;
-	sizes[1] = size - sizes[0];
-
-	/* copy per-cpu buffer to object */
-	addr = kmap_atomic(pages[0]);
-	memcpy(addr + off, buf, sizes[0]);
-	kunmap_atomic(addr);
-	addr = kmap_atomic(pages[1]);
-	memcpy(addr, buf + sizes[0], sizes[1]);
-	kunmap_atomic(addr);
-
-out:
-	/* enable page faults to match kunmap_atomic() return conditions */
-	pagefault_enable();
-}
-
-#endif /* USE_PGTABLE_MAPPING */
-
-static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
-				void *pcpu)
-{
-	int ret, cpu = (long)pcpu;
-	struct mapping_area *area;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-		area = &per_cpu(zs_map_area, cpu);
-		ret = __zs_cpu_up(area);
-		if (ret)
-			return notifier_from_errno(ret);
-		break;
-	case CPU_DEAD:
-	case CPU_UP_CANCELED:
-		area = &per_cpu(zs_map_area, cpu);
-		__zs_cpu_down(area);
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block zs_cpu_nb = {
-	.notifier_call = zs_cpu_notifier
-};
-
-static void zs_exit(void)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
-	unregister_cpu_notifier(&zs_cpu_nb);
-}
-
-static int zs_init(void)
-{
-	int cpu, ret;
-
-	register_cpu_notifier(&zs_cpu_nb);
-	for_each_online_cpu(cpu) {
-		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
-		if (notifier_to_errno(ret))
-			goto fail;
-	}
-	return 0;
-fail:
-	zs_exit();
-	return notifier_to_errno(ret);
-}
-
-/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
- *
- * This function must be called before anything when using
- * the zsmalloc allocator.
- *
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
- */
-struct zs_pool *zs_create_pool(gfp_t flags)
-{
-	int i, ovhd_size;
-	struct zs_pool *pool;
-
-	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
-	pool = kzalloc(ovhd_size, GFP_KERNEL);
-	if (!pool)
-		return NULL;
-
-	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
-		int size;
-		struct size_class *class;
-
-		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
-		if (size > ZS_MAX_ALLOC_SIZE)
-			size = ZS_MAX_ALLOC_SIZE;
-
-		class = &pool->size_class[i];
-		class->size = size;
-		class->index = i;
-		spin_lock_init(&class->lock);
-		class->pages_per_zspage = get_pages_per_zspage(size);
-
-	}
-
-	pool->flags = flags;
-
-	return pool;
-}
-EXPORT_SYMBOL_GPL(zs_create_pool);
-
-void zs_destroy_pool(struct zs_pool *pool)
-{
-	int i;
-
-	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
-		int fg;
-		struct size_class *class = &pool->size_class[i];
-
-		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
-			if (class->fullness_list[fg]) {
-				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
-					class->size, fg);
-			}
-		}
-	}
-	kfree(pool);
-}
-EXPORT_SYMBOL_GPL(zs_destroy_pool);
-
-/**
- * zs_malloc - Allocate block of given size from pool.
- * @pool: pool to allocate from
- * @size: size of block to allocate
- *
- * On success, handle to the allocated object is returned,
- * otherwise 0.
- * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
- */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
-{
-	unsigned long obj;
-	struct link_free *link;
-	int class_idx;
-	struct size_class *class;
-
-	struct page *first_page, *m_page;
-	unsigned long m_objidx, m_offset;
-
-	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
-		return 0;
-
-	class_idx = get_size_class_index(size);
-	class = &pool->size_class[class_idx];
-	BUG_ON(class_idx != class->index);
-
-	spin_lock(&class->lock);
-	first_page = find_get_zspage(class);
-
-	if (!first_page) {
-		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
-		if (unlikely(!first_page))
-			return 0;
-
-		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
-		spin_lock(&class->lock);
-		class->pages_allocated += class->pages_per_zspage;
-	}
-
-	obj = (unsigned long)first_page->freelist;
-	obj_handle_to_location(obj, &m_page, &m_objidx);
-	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
-	link = (struct link_free *)kmap_atomic(m_page) +
-					m_offset / sizeof(*link);
-	first_page->freelist = link->next;
-	memset(link, POISON_INUSE, sizeof(*link));
-	kunmap_atomic(link);
-
-	first_page->inuse++;
-	/* Now move the zspage to another fullness group, if required */
-	fix_fullness_group(pool, first_page);
-	spin_unlock(&class->lock);
-
-	return obj;
-}
-EXPORT_SYMBOL_GPL(zs_malloc);
-
-void zs_free(struct zs_pool *pool, unsigned long obj)
-{
-	struct link_free *link;
-	struct page *first_page, *f_page;
-	unsigned long f_objidx, f_offset;
-
-	int class_idx;
-	struct size_class *class;
-	enum fullness_group fullness;
-
-	if (unlikely(!obj))
-		return;
-
-	obj_handle_to_location(obj, &f_page, &f_objidx);
-	first_page = get_first_page(f_page);
-
-	get_zspage_mapping(first_page, &class_idx, &fullness);
-	class = &pool->size_class[class_idx];
-	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
-
-	spin_lock(&class->lock);
-
-	/* Insert this object in containing zspage's freelist */
-	link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
-							+ f_offset);
-	link->next = first_page->freelist;
-	kunmap_atomic(link);
-	first_page->freelist = (void *)obj;
-
-	first_page->inuse--;
-	fullness = fix_fullness_group(pool, first_page);
-
-	if (fullness == ZS_EMPTY)
-		class->pages_allocated -= class->pages_per_zspage;
-
-	spin_unlock(&class->lock);
-
-	if (fullness == ZS_EMPTY)
-		free_zspage(first_page);
-}
-EXPORT_SYMBOL_GPL(zs_free);
-
-/**
- * zs_map_object - get address of allocated object from handle.
- * @pool: pool from which the object was allocated
- * @handle: handle returned from zs_malloc
- *
- * Before using an object allocated from zs_malloc, it must be mapped using
- * this function. When done with the object, it must be unmapped using
- * zs_unmap_object.
- *
- * Only one object can be mapped per cpu at a time. There is no protection
- * against nested mappings.
- *
- * This function returns with preemption and page faults disabled.
- */
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
-			enum zs_mapmode mm)
-{
-	struct page *page;
-	unsigned long obj_idx, off;
-
-	unsigned int class_idx;
-	enum fullness_group fg;
-	struct size_class *class;
-	struct mapping_area *area;
-	struct page *pages[2];
-
-	BUG_ON(!handle);
-
-	/*
-	 * Because we use per-cpu mapping areas shared among the
-	 * pools/users, we can't allow mapping in interrupt context
-	 * because it can corrupt another users mappings.
-	 */
-	BUG_ON(in_interrupt());
-
-	obj_handle_to_location(handle, &page, &obj_idx);
-	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
-	class = &pool->size_class[class_idx];
-	off = obj_idx_to_offset(page, obj_idx, class->size);
-
-	area = &get_cpu_var(zs_map_area);
-	area->vm_mm = mm;
-	if (off + class->size <= PAGE_SIZE) {
-		/* this object is contained entirely within a page */
-		area->vm_addr = kmap_atomic(page);
-		return area->vm_addr + off;
-	}
-
-	/* this object spans two pages */
-	pages[0] = page;
-	pages[1] = get_next_page(page);
-	BUG_ON(!pages[1]);
-
-	return __zs_map_object(area, pages, off, class->size);
-}
-EXPORT_SYMBOL_GPL(zs_map_object);
-
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
-{
-	struct page *page;
-	unsigned long obj_idx, off;
-
-	unsigned int class_idx;
-	enum fullness_group fg;
-	struct size_class *class;
-	struct mapping_area *area;
-
-	BUG_ON(!handle);
-
-	obj_handle_to_location(handle, &page, &obj_idx);
-	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
-	class = &pool->size_class[class_idx];
-	off = obj_idx_to_offset(page, obj_idx, class->size);
-
-	area = &__get_cpu_var(zs_map_area);
-	if (off + class->size <= PAGE_SIZE)
-		kunmap_atomic(area->vm_addr);
-	else {
-		struct page *pages[2];
-
-		pages[0] = page;
-		pages[1] = get_next_page(page);
-		BUG_ON(!pages[1]);
-
-		__zs_unmap_object(area, pages, off, class->size);
-	}
-	put_cpu_var(zs_map_area);
-}
-EXPORT_SYMBOL_GPL(zs_unmap_object);
-
-u64 zs_get_total_size_bytes(struct zs_pool *pool)
-{
-	int i;
-	u64 npages = 0;
-
-	for (i = 0; i < ZS_SIZE_CLASSES; i++)
-		npages += pool->size_class[i].pages_allocated;
-
-	return npages << PAGE_SHIFT;
-}
-EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
-
-module_init(zs_init);
-module_exit(zs_exit);
-
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h
deleted file mode 100644
index fbe6bec42..000000000
--- a/drivers/staging/zsmalloc/zsmalloc.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * zsmalloc memory allocator
- *
- * Copyright (C) 2011  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the license that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-#ifndef _ZS_MALLOC_H_
-#define _ZS_MALLOC_H_
-
-#include <linux/types.h>
-
-/*
- * zsmalloc mapping modes
- *
- * NOTE: These only make a difference when a mapped object spans pages
- */
-enum zs_mapmode {
-	ZS_MM_RW, /* normal read-write mapping */
-	ZS_MM_RO, /* read-only (no copy-out at unmap time) */
-	ZS_MM_WO /* write-only (no copy-in at map time) */
-};
-
-struct zs_pool;
-
-struct zs_pool *zs_create_pool(gfp_t flags);
-void zs_destroy_pool(struct zs_pool *pool);
-
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
-void zs_free(struct zs_pool *pool, unsigned long obj);
-
-void *zs_map_object(struct zs_pool *pool, unsigned long handle,
-			enum zs_mapmode mm);
-void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
-
-u64 zs_get_total_size_bytes(struct zs_pool *pool);
-
-#endif
diff --git a/drivers/staging/zsmalloc/zsmalloc_int.h b/drivers/staging/zsmalloc/zsmalloc_int.h
deleted file mode 100644
index 92eefc663..000000000
--- a/drivers/staging/zsmalloc/zsmalloc_int.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * zsmalloc memory allocator
- *
- * Copyright (C) 2011  Nitin Gupta
- *
- * This code is released using a dual license strategy: BSD/GPL
- * You can choose the license that better fits your requirements.
- *
- * Released under the terms of 3-clause BSD License
- * Released under the terms of GNU General Public License Version 2.0
- */
-
-#ifndef _ZS_MALLOC_INT_H_
-#define _ZS_MALLOC_INT_H_
-
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-
-/*
- * This must be power of 2 and greater than of equal to sizeof(link_free).
- * These two conditions ensure that any 'struct link_free' itself doesn't
- * span more than 1 page which avoids complex case of mapping 2 pages simply
- * to restore link_free pointer values.
- */
-#define ZS_ALIGN		8
-
-/*
- * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
- * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
- */
-#define ZS_MAX_ZSPAGE_ORDER 2
-#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
-/*
- * Object location (<PFN>, <obj_idx>) is encoded as
- * as single (void *) handle value.
- *
- * Note that object index <obj_idx> is relative to system
- * page <PFN> it is stored in, so for each sub-page belonging
- * to a zspage, obj_idx starts with 0.
- *
- * This is made more complicated by various memory models and PAE.
- */
-
-#ifndef MAX_PHYSMEM_BITS
-#ifdef CONFIG_HIGHMEM64G
-#define MAX_PHYSMEM_BITS 36
-#else /* !CONFIG_HIGHMEM64G */
-/*
- * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
- * be PAGE_SHIFT
- */
-#define MAX_PHYSMEM_BITS BITS_PER_LONG
-#endif
-#endif
-#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
-#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
-
-#define MAX(a, b) ((a) >= (b) ? (a) : (b))
-/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
-#define ZS_MIN_ALLOC_SIZE \
-	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
-#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
-
-/*
- * On systems with 4K page size, this gives 254 size classes! There is a
- * trader-off here:
- *  - Large number of size classes is potentially wasteful as free page are
- *    spread across these classes
- *  - Small number of size classes causes large internal fragmentation
- *  - Probably its better to use specific size classes (empirically
- *    determined). NOTE: all those class sizes must be set as multiple of
- *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
- *
- *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
- *  (reason above)
- */
-#define ZS_SIZE_CLASS_DELTA	16
-#define ZS_SIZE_CLASSES		((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
-					ZS_SIZE_CLASS_DELTA + 1)
-
-/*
- * We do not maintain any list for completely empty or full pages
- */
-enum fullness_group {
-	ZS_ALMOST_FULL,
-	ZS_ALMOST_EMPTY,
-	_ZS_NR_FULLNESS_GROUPS,
-
-	ZS_EMPTY,
-	ZS_FULL
-};
-
-/*
- * We assign a page to ZS_ALMOST_EMPTY fullness group when:
- *	n <= N / f, where
- * n = number of allocated objects
- * N = total number of objects zspage can store
- * f = 1/fullness_threshold_frac
- *
- * Similarly, we assign zspage to:
- *	ZS_ALMOST_FULL	when n > N / f
- *	ZS_EMPTY	when n == 0
- *	ZS_FULL		when n == N
- *
- * (see: fix_fullness_group())
- */
-static const int fullness_threshold_frac = 4;
-
-struct mapping_area {
-	struct vm_struct *vm;
-	pte_t *vm_ptes[2];
-	char *vm_addr;
-};
-
-struct size_class {
-	/*
-	 * Size of objects stored in this class. Must be multiple
-	 * of ZS_ALIGN.
-	 */
-	int size;
-	unsigned int index;
-
-	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
-	int zspage_order;
-
-	spinlock_t lock;
-
-	/* stats */
-	u64 pages_allocated;
-
-	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
-};
-
-/*
- * Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
- *
- * This must be power of 2 and less than or equal to ZS_ALIGN
- */
-struct link_free {
-	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
-	void *next;
-};
-
-struct zs_pool {
-	struct size_class size_class[ZS_SIZE_CLASSES];
-
-	gfp_t flags;	/* allocation flags used when growing pool */
-	const char *name;
-};
-
-#endif
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 0a2d86ea5..b7df271f2 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1310,8 +1310,7 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c)
 			tty->canon_data++;
 			spin_unlock_irqrestore(&tty->read_lock, flags);
 			kill_fasync(&tty->fasync, SIGIO, POLL_IN);
-			if (waitqueue_active(&tty->read_wait))
-				wake_up_interruptible(&tty->read_wait);
+			wake_up_interruptible(&tty->read_wait);
 			return;
 		}
 	}
@@ -1434,8 +1433,7 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp,
 	if ((!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) ||
 		L_EXTPROC(tty)) {
 		kill_fasync(&tty->fasync, SIGIO, POLL_IN);
-		if (waitqueue_active(&tty->read_wait))
-			wake_up_interruptible(&tty->read_wait);
+		wake_up_interruptible(&tty->read_wait);
 	}
 
 	/*
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 8d4a762aa..061fcf47f 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2018,8 +2018,24 @@ static int tty_open(struct inode *inode, struct file *filp)
 	if (!noctty &&
 	    current->signal->leader &&
 	    !current->signal->tty &&
-	    tty->session == NULL)
-		__proc_set_tty(current, tty);
+	    tty->session == NULL) {
+		/*
+		 * Don't let a process that only has write access to the tty
+		 * obtain the privileges associated with having a tty as
+		 * controlling terminal (being able to reopen it with full
+		 * access through /dev/tty, being able to perform pushback).
+		 * Many distributions set the group of all ttys to "tty" and
+		 * grant write-only access to all terminals for setgid tty
+		 * binaries, which should not imply full privileges on all ttys.
+		 *
+		 * This could theoretically break old code that performs open()
+		 * on a write-only file descriptor. In that case, it might be
+		 * necessary to also permit this if
+		 * inode_permission(inode, MAY_READ) == 0.
+		 */
+		if (filp->f_mode & FMODE_READ)
+			__proc_set_tty(current, tty);
+	}
 	spin_unlock_irq(&current->sighand->siglock);
 	tty_unlock();
 	mutex_unlock(&tty_mutex);
@@ -2308,7 +2324,7 @@ static int fionbio(struct file *file, int __user *p)
  *		Takes ->siglock() when updating signal->tty
  */
 
-static int tiocsctty(struct tty_struct *tty, int arg)
+static int tiocsctty(struct tty_struct *tty, struct file *file, int arg)
 {
 	int ret = 0;
 	if (current->signal->leader && (task_session(current) == tty->session))
@@ -2341,6 +2357,13 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 			goto unlock;
 		}
 	}
+
+	/* See the comment in tty_open(). */
+	if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto unlock;
+	}
+
 	proc_set_tty(current, tty);
 unlock:
 	mutex_unlock(&tty_mutex);
@@ -2698,7 +2721,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		no_tty();
 		return 0;
 	case TIOCSCTTY:
-		return tiocsctty(tty, arg);
+		return tiocsctty(tty, file, arg);
 	case TIOCGPGRP:
 		return tiocgpgrp(tty, real_tty, p);
 	case TIOCSPGRP:
diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index cc1004a2f..bfc9b6912 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -114,16 +114,18 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno,
 				cfgno, inum, asnum, ep->desc.bEndpointAddress);
 		ep->ss_ep_comp.bmAttributes = 16;
 	} else if (usb_endpoint_xfer_isoc(&ep->desc) &&
-			desc->bmAttributes > 2) {
+		   USB_SS_MULT(desc->bmAttributes) > 3) {
 		dev_warn(ddev, "Isoc endpoint has Mult of %d in "
 				"config %d interface %d altsetting %d ep %d: "
-				"setting to 3\n", desc->bmAttributes + 1,
+				"setting to 3\n",
+				USB_SS_MULT(desc->bmAttributes),
 				cfgno, inum, asnum, ep->desc.bEndpointAddress);
 		ep->ss_ep_comp.bmAttributes = 2;
 	}
 
 	if (usb_endpoint_xfer_isoc(&ep->desc))
-		max_tx = (desc->bMaxBurst + 1) * (desc->bmAttributes + 1) *
+		max_tx = (desc->bMaxBurst + 1) *
+			(USB_SS_MULT(desc->bmAttributes)) *
 			usb_endpoint_maxp(&ep->desc);
 	else if (usb_endpoint_xfer_int(&ep->desc))
 		max_tx = usb_endpoint_maxp(&ep->desc) *
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index e845a1f80..392c125fc 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -49,6 +49,13 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Microsoft LifeCam-VX700 v2.0 */
 	{ USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* Logitech ConferenceCam CC3000e */
+	{ USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
+	{ USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT },
+
+	/* Logitech PTZ Pro Camera */
+	{ USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT },
+
 	/* Logitech Quickcam Fusion */
 	{ USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME },
 
@@ -73,6 +80,12 @@ static const struct usb_device_id usb_quirk_list[] = {
 	/* Philips PSC805 audio device */
 	{ USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME },
 
+	/* Plantronic Audio 655 DSP */
+	{ USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME },
+
+	/* Plantronic Audio 648 USB */
+	{ USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME },
+
 	/* Artisman Watchdog Dongle */
 	{ USB_DEVICE(0x04b4, 0x0526), .driver_info =
 			USB_QUIRK_CONFIG_INTF_STRINGS },
diff --git a/drivers/usb/host/ehci-sysfs.c b/drivers/usb/host/ehci-sysfs.c
index 14ced00ba..065902429 100644
--- a/drivers/usb/host/ehci-sysfs.c
+++ b/drivers/usb/host/ehci-sysfs.c
@@ -29,7 +29,7 @@ static ssize_t show_companion(struct device *dev,
 	int			count = PAGE_SIZE;
 	char			*ptr = buf;
 
-	ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev)));
+	ehci = hcd_to_ehci(dev_get_drvdata(dev));
 	nports = HCS_N_PORTS(ehci->hcs_params);
 
 	for (index = 0; index < nports; ++index) {
@@ -54,7 +54,7 @@ static ssize_t store_companion(struct device *dev,
 	struct ehci_hcd		*ehci;
 	int			portnum, new_owner;
 
-	ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev)));
+	ehci = hcd_to_ehci(dev_get_drvdata(dev));
 	new_owner = PORT_OWNER;		/* Owned by companion */
 	if (sscanf(buf, "%d", &portnum) != 1)
 		return -EINVAL;
@@ -85,7 +85,7 @@ static ssize_t show_uframe_periodic_max(struct device *dev,
 	struct ehci_hcd		*ehci;
 	int			n;
 
-	ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev)));
+	ehci = hcd_to_ehci(dev_get_drvdata(dev));
 	n = scnprintf(buf, PAGE_SIZE, "%d\n", ehci->uframe_periodic_max);
 	return n;
 }
@@ -102,7 +102,7 @@ static ssize_t store_uframe_periodic_max(struct device *dev,
 	unsigned long		flags;
 	ssize_t			ret;
 
-	ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev)));
+	ehci = hcd_to_ehci(dev_get_drvdata(dev));
 	if (kstrtouint(buf, 0, &uframe_periodic_max) < 0)
 		return -EINVAL;
 
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 048cc382a..cad4a174e 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -1493,10 +1493,10 @@ int xhci_endpoint_init(struct xhci_hcd *xhci,
 	 * use Event Data TRBs, and we don't chain in a link TRB on short
 	 * transfers, we're basically dividing by 1.
 	 *
-	 * xHCI 1.0 specification indicates that the Average TRB Length should
-	 * be set to 8 for control endpoints.
+	 * xHCI 1.0 and 1.1 specification indicates that the Average TRB Length
+	 * should be set to 8 for control endpoints.
 	 */
-	if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version == 0x100)
+	if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100)
 		ep_ctx->tx_info |= cpu_to_le32(AVG_TRB_LENGTH_FOR_EP(8));
 	else
 		ep_ctx->tx_info |=
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 710b2e98b..305393373 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -121,6 +121,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
 		 * PPT chipsets.
 		 */
 		xhci->quirks |= XHCI_SPURIOUS_REBOOT;
+		xhci->quirks |= XHCI_SPURIOUS_WAKEUP;
 	}
 	if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 		(pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI ||
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index d857ba543..5d4f1b754 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -331,6 +331,15 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci)
 	ret = handshake(xhci, &xhci->op_regs->cmd_ring,
 			CMD_RING_RUNNING, 0, 5 * 1000 * 1000);
 	if (ret < 0) {
+		/* we are about to kill xhci, give it one more chance */
+		xhci_write_64(xhci, temp_64 | CMD_RING_ABORT,
+			      &xhci->op_regs->cmd_ring);
+		udelay(1000);
+		ret = handshake(xhci, &xhci->op_regs->cmd_ring,
+				CMD_RING_RUNNING, 0, 3 * 1000 * 1000);
+		if (ret == 0)
+			return 0;
+
 		xhci_err(xhci, "Stopped the command ring failed, "
 				"maybe the host is dead\n");
 		xhci->xhc_state |= XHCI_STATE_DYING;
@@ -2337,6 +2346,7 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 	u32 trb_comp_code;
 	int ret = 0;
 	int td_num = 0;
+	bool handling_skipped_tds = false;
 
 	slot_id = TRB_TO_SLOT_ID(le32_to_cpu(event->flags));
 	xdev = xhci->devs[slot_id];
@@ -2470,6 +2480,10 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 		ep->skip = true;
 		xhci_dbg(xhci, "Miss service interval error, set skip flag\n");
 		goto cleanup;
+	case COMP_PING_ERR:
+		ep->skip = true;
+		xhci_dbg(xhci, "No Ping response error, Skip one Isoc TD\n");
+		goto cleanup;
 	default:
 		if (xhci_is_vendor_info_code(xhci, trb_comp_code)) {
 			status = 0;
@@ -2601,13 +2615,18 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 						 ep, &status);
 
 cleanup:
+
+
+		handling_skipped_tds = ep->skip &&
+			trb_comp_code != COMP_MISSED_INT &&
+			trb_comp_code != COMP_PING_ERR;
+
 		/*
-		 * Do not update event ring dequeue pointer if ep->skip is set.
-		 * Will roll back to continue process missed tds.
+		 * Do not update event ring dequeue pointer if we're in a loop
+		 * processing missed tds.
 		 */
-		if (trb_comp_code == COMP_MISSED_INT || !ep->skip) {
+		if (!handling_skipped_tds)
 			inc_deq(xhci, xhci->event_ring);
-		}
 
 		if (ret) {
 			urb = td->urb;
@@ -2642,7 +2661,7 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 	 * Process them as short transfer until reach the td pointed by
 	 * the event.
 	 */
-	} while (ep->skip && trb_comp_code != COMP_MISSED_INT);
+	} while (handling_skipped_tds);
 
 	return 0;
 }
@@ -3493,8 +3512,8 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
 	if (start_cycle == 0)
 		field |= 0x1;
 
-	/* xHCI 1.0 6.4.1.2.1: Transfer Type field */
-	if (xhci->hci_version == 0x100) {
+	/* xHCI 1.0/1.1 6.4.1.2.1: Transfer Type field */
+	if (xhci->hci_version >= 0x100) {
 		if (urb->transfer_buffer_length > 0) {
 			if (setup->bRequestType & USB_DIR_IN)
 				field |= TRB_TX_TYPE(TRB_DATA_IN);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index c918fd6aa..eca811ef9 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -141,7 +141,8 @@ static int xhci_start(struct xhci_hcd *xhci)
 				"waited %u microseconds.\n",
 				XHCI_MAX_HALT_USEC);
 	if (!ret)
-		xhci->xhc_state &= ~XHCI_STATE_HALTED;
+		xhci->xhc_state &= ~(XHCI_STATE_HALTED | XHCI_STATE_DYING);
+
 	return ret;
 }
 
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 1e4899c2d..4038789d6 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -629,6 +629,10 @@ static struct usb_device_id id_table_combined [] = {
 	{ USB_DEVICE(FTDI_VID, FTDI_NT_ORIONLXM_PID),
 		.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
 	{ USB_DEVICE(FTDI_VID, FTDI_SYNAPSE_SS200_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2WI_PID) },
+	{ USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX3_PID) },
 	/*
 	 * ELV devices:
 	 */
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 1fee973f1..70b24c02b 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -568,6 +568,14 @@
  */
 #define FTDI_SYNAPSE_SS200_PID 0x9090 /* SS200 - SNAP Stick 200 */
 
+/*
+ * CustomWare / ShipModul NMEA multiplexers product ids (FTDI_VID)
+ */
+#define FTDI_CUSTOMWARE_MINIPLEX_PID	0xfd48	/* MiniPlex first generation NMEA Multiplexer */
+#define FTDI_CUSTOMWARE_MINIPLEX2_PID	0xfd49	/* MiniPlex-USB and MiniPlex-2 series */
+#define FTDI_CUSTOMWARE_MINIPLEX2WI_PID	0xfd4a	/* MiniPlex-2Wi */
+#define FTDI_CUSTOMWARE_MINIPLEX3_PID	0xfd4b	/* MiniPlex-3 series */
+
 
 /********************************/
 /** third-party VID/PID combos **/
diff --git a/drivers/video/msm/mdss/mdss_dsi.c b/drivers/video/msm/mdss/mdss_dsi.c
index 3163ebd9a..a15204ef3 100644
--- a/drivers/video/msm/mdss/mdss_dsi.c
+++ b/drivers/video/msm/mdss/mdss_dsi.c
@@ -23,6 +23,10 @@
 #include <linux/regulator/consumer.h>
 #include <linux/clk.h>
 
+#ifdef CONFIG_STATE_NOTIFIER
+#include <linux/state_notifier.h>
+#endif
+
 #include "mdss.h"
 #include "mdss_fb.h"
 #include "mdss_panel.h"
@@ -1474,6 +1478,10 @@ static int mdss_dsi_event_handler(struct mdss_panel_data *pdata,
 		ctrl_pdata->mdp_tg_on = 1;
 		if (ctrl_pdata->on_cmds.link_state == DSI_HS_MODE)
 			rc = mdss_dsi_unblank(pdata);
+#ifdef CONFIG_STATE_NOTIFIER
+		if (!use_fb_notifier)
+			state_resume();
+#endif
 		break;
 	case MDSS_EVENT_BLANK:
 		if (ctrl_pdata->off_cmds.link_state == DSI_HS_MODE)
@@ -1484,6 +1492,10 @@ static int mdss_dsi_event_handler(struct mdss_panel_data *pdata,
 		if (ctrl_pdata->off_cmds.link_state == DSI_LP_MODE)
 			rc = mdss_dsi_blank(pdata);
 		rc = mdss_dsi_off(pdata);
+#ifdef CONFIG_STATE_NOTIFIER
+		if (!use_fb_notifier)
+			state_suspend();
+#endif
 		break;
 #if defined(CONFIG_FB_MSM_MDSS_S6E8AA0A_HD_PANEL)
 	case MTP_READ:
diff --git a/drivers/video/msm/mdss/mdss_mdp_pp.c b/drivers/video/msm/mdss/mdss_mdp_pp.c
index 94ab630a2..def3f7de9 100644
--- a/drivers/video/msm/mdss/mdss_mdp_pp.c
+++ b/drivers/video/msm/mdss/mdss_mdp_pp.c
@@ -4373,7 +4373,7 @@ int mdss_mdp_ad_input(struct msm_fb_data_type *mfd,
 	mutex_lock(&ad->lock);
 	if ((!PP_AD_STATE_IS_INITCFG(ad->state) &&
 			!PP_AD_STS_IS_DIRTY(ad->sts)) &&
-			!input->mode == MDSS_AD_MODE_CALIB) {
+			input->mode != MDSS_AD_MODE_CALIB) {
 		pr_warn("AD not initialized or configured.");
 		ret = -EPERM;
 		goto error;
diff --git a/drivers/video/msm/mhl_v2/sii8240/sii8240.c b/drivers/video/msm/mhl_v2/sii8240/sii8240.c
index 9c8c43769..9ad69723f 100755
--- a/drivers/video/msm/mhl_v2/sii8240/sii8240.c
+++ b/drivers/video/msm/mhl_v2/sii8240/sii8240.c
@@ -444,8 +444,7 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on)
 		return ret;
 	}
 
-	switch (tmds_on) {
-	case true:
+	if (tmds_on) {
 #ifdef SFEATURE_HDCP_SUPPORT
 		ret = mhl_read_byte_reg(tpi, 0x1A, &value);
 		if (TMDS_OUTPUT_CONTROL_POWER_DOWN & value) {
@@ -497,9 +496,8 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on)
 		if (unlikely(ret < 0))
 			pr_err("[ERROR] %s() send AVIF fail\n", __func__);
 #endif
-		break;
+	} else {
 
-	case false:
 #ifdef SFEATURE_HDCP_SUPPORT
 		sii8240_hdcp_on(sii8240, false);
 #endif
@@ -520,11 +518,6 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on)
 					__func__, __LINE__);
 			return ret;
 		}
-		break;
-
-	default:
-		pr_err("[ERROR] %s() unknown value\n", __func__);
-		break;
 	}
 	return ret;
 }
diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c
index 89f264c67..de9273ea7 100644
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -235,7 +235,7 @@ static int __init no_cleancache(char *s)
 }
 __setup("nocleancache", no_cleancache);
 
-static struct cleancache_ops __initdata tmem_cleancache_ops = {
+static const struct cleancache_ops tmem_cleancache_ops = {
 	.put_page = tmem_cleancache_put_page,
 	.get_page = tmem_cleancache_get_page,
 	.invalidate_page = tmem_cleancache_flush_page,
@@ -389,14 +389,26 @@ static int __init xen_tmem_init(void)
 #endif
 #ifdef CONFIG_CLEANCACHE
 	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
-	if (tmem_enabled && use_cleancache) {
-		char *s = "";
-		struct cleancache_ops old_ops =
-			cleancache_register_ops(&tmem_cleancache_ops);
-		if (old_ops.init_fs != NULL)
-			s = " (WARNING: cleancache_ops overridden)";
-		printk(KERN_INFO "cleancache enabled, RAM provided by "
-				 "Xen Transcendent Memory%s\n", s);
+	if (tmem_enabled && cleancache) {
+		int err;
+
+		err = cleancache_register_ops(&tmem_cleancache_ops);
+		if (err)
+			pr_warn("xen-tmem: failed to enable cleancache: %d\n",
+				err);
+		else
+			pr_info("cleancache enabled, RAM provided by "
+				"Xen Transcendent Memory\n");
+	}
+#endif
+#ifdef CONFIG_XEN_SELFBALLOONING
+	/*
+	 * There is no point of driving pages to the swap system if they
+	 * aren't going anywhere in tmem universe.
+	 */
+	if (!frontswap) {
+		selfshrinking = false;
+		selfballooning = false;
 	}
 #endif
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9e5132582..575c1902d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3685,7 +3685,8 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	if (!special_file(inode->i_mode))
+		btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	if (root->fs_info->log_root_recovering) {
 		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f4fa5cf0c..e5eacd9dd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -383,8 +383,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (opt->flags & CEPH_OPT_NOCRC)
 		seq_puts(m, ",nocrc");
 
-	if (opt->name)
-		seq_printf(m, ",name=%s", opt->name);
+	if (opt->name) {
+		seq_puts(m, ",name=");
+		seq_escape(m, opt->name, ", \t\n\\");
+	}
 	if (opt->key)
 		seq_puts(m, ",secret=<hidden>");
 
@@ -429,7 +431,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
 		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
 	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
-		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+		seq_show_option(m, "snapdirname", fsopt->snapdir_name);
 	return 0;
 }
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 6dd3b61ea..8431216ee 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -388,6 +388,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	return 0;
 }
 
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find the server given timestamp
+ * as part of ntlmv2 authentication (or local current time as
+ * default in case of failure)
+ */
+static __le64
+find_timestamp(struct cifs_ses *ses)
+{
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned int onesize = sizeof(struct ntlmssp2_name);
+	unsigned char *blobptr;
+	unsigned char *blobend;
+	struct ntlmssp2_name *attrptr;
+
+	if (!ses->auth_key.len || !ses->auth_key.response)
+		return 0;
+
+	blobptr = ses->auth_key.response;
+	blobend = blobptr + ses->auth_key.len;
+
+	while (blobptr + onesize < blobend) {
+		attrptr = (struct ntlmssp2_name *) blobptr;
+		type = le16_to_cpu(attrptr->type);
+		if (type == NTLMSSP_AV_EOL)
+			break;
+		blobptr += 2; /* advance attr type */
+		attrsize = le16_to_cpu(attrptr->length);
+		blobptr += 2; /* advance attr size */
+		if (blobptr + attrsize > blobend)
+			break;
+		if (type == NTLMSSP_AV_TIMESTAMP) {
+			if (attrsize == sizeof(u64))
+				return *((__le64 *)blobptr);
+		}
+		blobptr += attrsize; /* advance attr value */
+	}
+
+	return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+}
+
 static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
 			    const struct nls_table *nls_cp)
 {
@@ -549,6 +591,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 	struct ntlmv2_resp *buf;
 	char ntlmv2_hash[16];
 	unsigned char *tiblob = NULL; /* target info blob */
+	__le64 rsp_timestamp;
 
 	if (ses->server->secType == RawNTLMSSP) {
 		if (!ses->domainName) {
@@ -566,6 +609,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 		}
 	}
 
+	/* Must be within 5 minutes of the server (or in range +/-2h
+	 * in case of Mac OS X), so simply carry over server timestamp
+	 * (as Windows 7 does)
+	 */
+	rsp_timestamp = find_timestamp(ses);
+
 	baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
 	tilen = ses->auth_key.len;
 	tiblob = ses->auth_key.response;
@@ -583,7 +632,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 			(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
-	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+	buf->time = rsp_timestamp;
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c0f65e848..5b730ba78 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -373,10 +373,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
 		seq_printf(s, ",multiuser");
 	else if (tcon->ses->user_name)
-		seq_printf(s, ",username=%s", tcon->ses->user_name);
+		seq_show_option(s, "username", tcon->ses->user_name);
 
 	if (tcon->ses->domainName)
-		seq_printf(s, ",domain=%s", tcon->ses->domainName);
+		seq_show_option(s, "domain", tcon->ses->domainName);
 
 	if (srcaddr->sa_family != AF_UNSPEC) {
 		struct sockaddr_in *saddr4;
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 534c1d46e..eba8f1d4a 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -55,26 +55,26 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
-	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
-		goto out;
-	if (nd) {
-		dentry_save = nd->path.dentry;
-		vfsmount_save = nd->path.mnt;
-		nd->path.dentry = lower_dentry;
-		nd->path.mnt = lower_mnt;
-	}
-	rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
-	if (nd) {
-		nd->path.dentry = dentry_save;
-		nd->path.mnt = vfsmount_save;
+	if (lower_dentry->d_op && lower_dentry->d_op->d_revalidate) {
+		if (nd) {
+			dentry_save = nd->path.dentry;
+			vfsmount_save = nd->path.mnt;
+			nd->path.dentry = lower_dentry;
+			nd->path.mnt = lower_mnt;
+		}
+		rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
+		if (nd) {
+			nd->path.dentry = dentry_save;
+			nd->path.mnt = vfsmount_save;
+		}
 	}
 	if (dentry->d_inode) {
-		struct inode *lower_inode =
-			ecryptfs_inode_to_lower(dentry->d_inode);
+		struct inode *inode = dentry->d_inode;
 
-		fsstack_copy_attr_all(dentry->d_inode, lower_inode);
+		fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode));
+		if (!inode->i_nlink)
+			return 0;
 	}
-out:
 	return rc;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f408762a..951ff257f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1677,10 +1677,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
 	}
 
 	if (sbi->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+		seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
 
 	if (sbi->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+		seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
 
 	if (test_opt(sb, USRQUOTA))
 		seq_puts(seq, ",usrquota");
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 6172fa77a..4db9a9a31 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1298,11 +1298,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	if (is_ancestor(root, sdp->sd_master_dir))
 		seq_printf(s, ",meta");
 	if (args->ar_lockproto[0])
-		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+		seq_show_option(s, "lockproto", args->ar_lockproto);
 	if (args->ar_locktable[0])
-		seq_printf(s, ",locktable=%s", args->ar_locktable);
+		seq_show_option(s, "locktable", args->ar_locktable);
 	if (args->ar_hostdata[0])
-		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+		seq_show_option(s, "hostdata", args->ar_hostdata);
 	if (args->ar_spectator)
 		seq_printf(s, ",spectator");
 	if (args->ar_localflocks)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index cdb41a1f6..8daea16ef 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -287,7 +287,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-		page_cache_release(page);
 		node->page[i] = page;
 	}
 
@@ -397,11 +396,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-	//int i;
+	int i;
 
-	//for (i = 0; i < node->tree->pages_per_bnode; i++)
-	//	if (node->page[i])
-	//		page_cache_release(node->page[i]);
+	for (i = 0; i < node->tree->pages_per_bnode; i++)
+		if (node->page[i])
+			page_cache_release(node->page[i]);
 	kfree(node);
 }
 
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 92fb358ce..db240c54a 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -132,13 +132,16 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
 	hfs_bnode_write(node, entry, data_off + key_len, entry_len);
 	hfs_bnode_dump(node);
 
-	if (new_node) {
-		/* update parent key if we inserted a key
-		 * at the start of the first node
-		 */
-		if (!rec && new_node != node)
-			hfs_brec_update_parent(fd);
+	/*
+	 * update parent key if we inserted a key
+	 * at the start of the node and it is not the new node
+	 */
+	if (!rec && new_node != node) {
+		hfs_bnode_read_key(node, fd->search_key, data_off + size);
+		hfs_brec_update_parent(fd);
+	}
 
+	if (new_node) {
 		hfs_bnode_put(fd->bnode);
 		if (!new_node->parent) {
 			hfs_btree_inc_height(tree);
@@ -167,9 +170,6 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len)
 		goto again;
 	}
 
-	if (!rec)
-		hfs_brec_update_parent(fd);
-
 	return 0;
 }
 
@@ -366,6 +366,8 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd)
 	if (IS_ERR(parent))
 		return PTR_ERR(parent);
 	__hfs_brec_find(parent, fd);
+	if (fd->record < 0)
+		return -ENOENT;
 	hfs_bnode_dump(parent);
 	rec = fd->record;
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 7b4c537d6..be0e218a3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -138,9 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
 
 	if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
 	if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+		seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
 	seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid);
 	if (sbi->s_file_umask != 0133)
 		seq_printf(seq, ",file_umask=%o", sbi->s_file_umask);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 1c42cc5b8..a1e91092f 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 			page_cache_release(page);
 			goto fail;
 		}
-		page_cache_release(page);
 		node->page[i] = page;
 	}
 
@@ -566,13 +565,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num)
 
 void hfs_bnode_free(struct hfs_bnode *node)
 {
-#if 0
 	int i;
 
 	for (i = 0; i < node->tree->pages_per_bnode; i++)
 		if (node->page[i])
 			page_cache_release(node->page[i]);
-#endif
 	kfree(node);
 }
 
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 06fa56186..38e41d07d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -211,9 +211,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
 	struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
 
 	if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+		seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
 	if (sbi->type != HFSPLUS_DEF_CR_TYPE)
-		seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+		seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
 	seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
 		sbi->uid, sbi->gid);
 	if (sbi->part >= 0)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 07c516bfe..fe63b15f5 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -264,7 +264,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
 	size_t offset = strlen(root_ino) + 1;
 
 	if (strlen(root_path) > offset)
-		seq_printf(seq, ",%s", root_path + offset);
+		seq_show_option(seq, root_path + offset, NULL);
 
 	return 0;
 }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 30dd7b10b..bdb86a8a8 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -8,6 +8,17 @@
 #include <linux/sched.h>
 #include "hpfs_fn.h"
 
+static void hpfs_update_directory_times(struct inode *dir)
+{
+	time_t t = get_seconds();
+	if (t == dir->i_mtime.tv_sec &&
+	    t == dir->i_ctime.tv_sec)
+		return;
+	dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t;
+	dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0;
+	hpfs_write_inode_nolock(dir);
+}
+
 static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	const unsigned char *name = dentry->d_name.name;
@@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		result->i_mode = mode | S_IFDIR;
 		hpfs_write_inode_nolock(result);
 	}
+	hpfs_update_directory_times(dir);
 	d_instantiate(dentry, result);
 	hpfs_unlock(dir->i_sb);
 	return 0;
@@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, s
 		result->i_mode = mode | S_IFREG;
 		hpfs_write_inode_nolock(result);
 	}
+	hpfs_update_directory_times(dir);
 	d_instantiate(dentry, result);
 	hpfs_unlock(dir->i_sb);
 	return 0;
@@ -262,6 +275,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
 	insert_inode_hash(result);
 
 	hpfs_write_inode_nolock(result);
+	hpfs_update_directory_times(dir);
 	d_instantiate(dentry, result);
 	brelse(bh);
 	hpfs_unlock(dir->i_sb);
@@ -340,6 +354,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	insert_inode_hash(result);
 
 	hpfs_write_inode_nolock(result);
+	hpfs_update_directory_times(dir);
 	d_instantiate(dentry, result);
 	hpfs_unlock(dir->i_sb);
 	return 0;
@@ -423,6 +438,8 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
 out1:
 	hpfs_brelse4(&qbh);
 out:
+	if (!err)
+		hpfs_update_directory_times(dir);
 	hpfs_unlock(dir->i_sb);
 	return err;
 }
@@ -477,6 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 out1:
 	hpfs_brelse4(&qbh);
 out:
+	if (!err)
+		hpfs_update_directory_times(dir);
 	hpfs_unlock(dir->i_sb);
 	return err;
 }
@@ -595,7 +614,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto end1;
 	}
 
-	end:
+end:
 	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
 	if (S_ISDIR(i->i_mode)) {
 		inc_nlink(new_dir);
@@ -610,6 +629,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		brelse(bh);
 	}
 end1:
+	if (!err) {
+		hpfs_update_directory_times(old_dir);
+		hpfs_update_directory_times(new_dir);
+	}
 	hpfs_unlock(i->i_sb);
 	return err;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3d344ab0b..92eff4da0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1851,7 +1851,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 
-	if (opendata->o_arg.open_flags & O_EXCL) {
+	if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) {
 		nfs4_exclusive_attrset(opendata, sattr);
 
 		nfs_fattr_init(opendata->o_res.f_attr);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 7ba6ac187..8e48ba5f6 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1411,6 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	int found, ret;
 	int set_maybe;
 	int dispatch_assert = 0;
+	int dispatched = 0;
 
 	if (!dlm_grab(dlm))
 		return DLM_MASTER_RESP_NO;
@@ -1617,13 +1618,16 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
 			response = DLM_MASTER_RESP_ERROR;
 			dlm_lockres_put(res);
+		} else {
+			dispatched = 1;
 		}
 	} else {
 		if (res)
 			dlm_lockres_put(res);
 	}
 
-	dlm_put(dlm);
+	if (!dispatched)
+		dlm_put(dlm);
 	return response;
 }
 
@@ -2041,7 +2045,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
 
 
 	/* queue up work for dlm_assert_master_worker */
-	dlm_grab(dlm);  /* get an extra ref for the work item */
 	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
 	item->u.am.lockres = res; /* already have a ref */
 	/* can optionally ignore node numbers higher than this node */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d15b0714e..0e5013ed7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1689,6 +1689,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 	unsigned int hash;
 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+	int dispatched = 0;
 
 	if (!dlm_grab(dlm)) {
 		/* since the domain has gone away on this
@@ -1710,6 +1711,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 				mlog_errno(-ENOMEM);
 				/* retry!? */
 				BUG();
+			} else {
+				dispatched = 1;
 			}
 		} else /* put.. incase we are not the master */
 			dlm_lockres_put(res);
@@ -1717,7 +1720,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 	}
 	spin_unlock(&dlm->spinlock);
 
-	dlm_put(dlm);
+	if (!dispatched)
+		dlm_put(dlm);
 	return master;
 }
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 68f4541c2..7e478160b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1578,8 +1578,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",localflocks,");
 
 	if (osb->osb_cluster_stack[0])
-		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
-			   osb->osb_cluster_stack);
+		seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+				  OCFS2_STACK_LABEL_LEN);
 	if (opts & OCFS2_MOUNT_USRQUOTA)
 		seq_printf(s, ",usrquota");
 	if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -2357,7 +2357,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		mlog_errno(status);
 		goto bail;
 	}
-	cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
+	
+	cleancache_init_shared_fs(sb);
 
 bail:
 	return status;
diff --git a/fs/pipe.c b/fs/pipe.c
index abfb93525..6049235e2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -390,6 +390,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 			void *addr;
 			size_t chars = buf->len, remaining;
 			int error, atomic;
+			int offset;
 
 			if (chars > total_len)
 				chars = total_len;
@@ -403,9 +404,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 
 			atomic = !iov_fault_in_pages_write(iov, chars);
 			remaining = chars;
+			offset = buf->offset;
 redo:
 			addr = ops->map(pipe, buf, atomic);
-			error = pipe_iov_copy_to_user(iov, addr, &buf->offset,
+			error = pipe_iov_copy_to_user(iov, addr, &offset,
 						      &remaining, atomic);
 			ops->unmap(pipe, buf, addr);
 			if (unlikely(error)) {
@@ -421,6 +423,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 				break;
 			}
 			ret += chars;
+			buf->offset += chars;
 			buf->len -= chars;
 
 			/* Was it a packet buffer? Clean up and exit */
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 33f9e9b81..e1bd03dda 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		* sysctl_overcommit_ratio / 100) + total_swap_pages;
 
 	cached = global_page_state(NR_FILE_PAGES) -
-			total_swapcache_pages - i.bufferram;
+			total_swapcache_pages()- i.bufferram;
 	if (cached < 0)
 		cached = 0;
 
@@ -112,7 +112,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
-		K(total_swapcache_pages),
+		K(total_swapcache_pages()),
 		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
 		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
 		K(pages[LRU_ACTIVE_ANON]),
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 85a854e38..0abccc488 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -170,7 +170,7 @@ static void init_header(struct ctl_table_header *head,
 	if (node) {
 		struct ctl_table *entry;
 		for (entry = table; entry->procname; entry++, node++) {
-			rb_init_node(&node->node);
+			RB_CLEAR_NODE(&node->node);
 			node->header = head;
 		}
 	}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 8169be93a..e12357bb3 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -645,18 +645,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",acl");
 
 	if (REISERFS_SB(s)->s_jdev)
-		seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+		seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
 
 	if (journal->j_max_commit_age != journal->j_default_max_commit_age)
 		seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
 
 #ifdef CONFIG_QUOTA
 	if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+		seq_show_option(seq, "usrjquota",
+				REISERFS_SB(s)->s_qf_names[USRQUOTA]);
 	else if (opts & (1 << REISERFS_USRQUOTA))
 		seq_puts(seq, ",usrquota");
 	if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+		seq_show_option(seq, "grpjquota",
+				REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
 	else if (opts & (1 << REISERFS_GRPQUOTA))
 		seq_puts(seq, ",grpquota");
 	if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/splice.c b/fs/splice.c
index 67c5210e7..286417764 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1165,7 +1165,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	long ret, bytes;
 	umode_t i_mode;
 	size_t len;
-	int i, flags;
+	int i, flags, more;
 
 	/*
 	 * We require the input being a regular file, as we don't want to
@@ -1208,6 +1208,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 * Don't block on output, we have to drain the direct pipe.
 	 */
 	sd->flags &= ~SPLICE_F_NONBLOCK;
+	more = sd->flags & SPLICE_F_MORE;
 
 	while (len) {
 		size_t read_len;
@@ -1220,6 +1221,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		read_len = ret;
 		sd->total_len = read_len;
 
+		/*
+		 * If more data is pending, set SPLICE_F_MORE
+		 * If this is the last data and SPLICE_F_MORE was not set
+		 * initially, clears it.
+		 */
+		if (read_len < len)
+			sd->flags |= SPLICE_F_MORE;
+		else if (!more)
+			sd->flags &= ~SPLICE_F_MORE;
 		/*
 		 * NOTE: nonblocking mode only applies to the input. We
 		 * must not do the output in nonblocking mode as then we
diff --git a/fs/super.c b/fs/super.c
index 672ccae0f..88b352193 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -167,7 +167,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
-		s->cleancache_poolid = -1;
+		s->cleancache_poolid = CLEANCACHE_NO_POOL;
 
 		s->s_shrink.seeks = DEFAULT_SEEKS;
 		s->s_shrink.shrink = prune_super;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index dab9a5f6d..d6c787dc2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -523,9 +523,9 @@ xfs_showargs(
 		seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
 
 	if (mp->m_logname)
-		seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+		seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
 	if (mp->m_rtname)
-		seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+		seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
 
 	if (mp->m_dalign > 0)
 		seq_printf(m, "," MNTOPT_SUNIT "=%d",
diff --git a/ik.ramdisk/common/default.prop b/ik.ramdisk/common/default.prop
index 077870132..8200bd439 100644
--- a/ik.ramdisk/common/default.prop
+++ b/ik.ramdisk/common/default.prop
@@ -5,7 +5,7 @@ persist.cne.feature=0
 persist.security.ams.enforcing=0
 ro.secure=0
 ro.allow.mock.location=1
-ro.debuggable=1
+ro.debuggable=0
 ro.adb.secure=0
 ro.zygote=zygote32
 persist.sys.strict_op_enable=false
diff --git a/ik.ramdisk/common/fstab.qcom b/ik.ramdisk/common/fstab.qcom
index 03e280b7f..d373b4236 100644
--- a/ik.ramdisk/common/fstab.qcom
+++ b/ik.ramdisk/common/fstab.qcom
@@ -3,10 +3,10 @@
 # specify MF_CHECK, and must come before any filesystems that do specify MF_CHECK
 
 #<src>						<mnt_point>	<type>	<mnt_flags and options>								<fs_mgr_flags>
-/dev/block/platform/msm_sdcc.1/by-name/system	/system		f2fs	ro										wait
 /dev/block/platform/msm_sdcc.1/by-name/system	/system		ext4	ro,errors=panic									wait
-/dev/block/platform/msm_sdcc.1/by-name/userdata	/data		f2fs	nosuid,nodev,noatime,nodiratime,discard,inline_xattr				wait,encryptable=footer
+/dev/block/platform/msm_sdcc.1/by-name/system	/system		f2fs	ro										wait
 /dev/block/platform/msm_sdcc.1/by-name/userdata	/data		ext4	nosuid,nodev,noatime,noauto_da_alloc,discard,journal_async_commit,errors=panic	wait,check,encryptable=footer
+/dev/block/platform/msm_sdcc.1/by-name/userdata	/data		f2fs	nosuid,nodev,noatime,nodiratime,discard,inline_xattr				wait,encryptable=footer
 # VOLD
 /devices/msm_sdcc.[.23]/mmc_host/mmc*		auto		vfat	default										voldmanaged=extSdCard:auto,noemulatedsd
 /devices/platform/xhci-hcd/usb*sda		auto		vfat	default										voldmanaged=UsbDriveA:auto
diff --git a/ik.ramdisk/common/init.idlekernel.sh b/ik.ramdisk/common/init.idlekernel.sh
index 44022c976..691d2dd02 100755
--- a/ik.ramdisk/common/init.idlekernel.sh
+++ b/ik.ramdisk/common/init.idlekernel.sh
@@ -118,6 +118,16 @@ if [ ! -f $CFILE ]; then
 	echo $CDEF > $CFILE
 fi
 echo `cat $CFILE` > $SFILE
+#
+# uksm
+#
+CFILE=$DDIR/uksm_run
+SFILE=/sys/kernel/mm/uksm/run
+CDEF=0
+if [ ! -f $CFILE ]; then
+	echo $CDEF > $CFILE
+fi
+echo `cat $CFILE` > $SFILE
 
 /sbin/setonboot apply &
 /system/xbin/busybox run-parts /system/etc/init.d &
@@ -125,3 +135,36 @@ echo `cat $CFILE` > $SFILE
 stop thermal-engine
 sleep 2
 start thermal-engine
+
+(
+	sleep 60;
+
+	BB=/system/xbin/busybox
+	
+	# stop google service and restart it on boot. this remove high cpu load and ram leak!
+	if [ "$($BB pidof com.google.android.gms | wc -l)" -eq "1" ]; then
+		$BB kill "$($BB pidof com.google.android.gms)";
+	fi;
+	if [ "$($BB pidof com.google.android.gms.unstable | wc -l)" -eq "1" ]; then
+		$BB kill "$($BB pidof com.google.android.gms.unstable)";
+	fi;
+	if [ "$($BB pidof com.google.android.gms.persistent | wc -l)" -eq "1" ]; then
+		$BB kill "$($BB pidof com.google.android.gms.persistent)";
+	fi;
+	if [ "$($BB pidof com.google.android.gms.wearable | wc -l)" -eq "1" ]; then
+		$BB kill "$($BB pidof com.google.android.gms.wearable)";
+	fi;
+
+	# Google Services battery drain fixer by Alcolawl@xda
+	# http://forum.xda-developers.com/google-nexus-5/general/script-google-play-services-battery-t3059585/post59563859
+	pm enable com.google.android.gms/.update.SystemUpdateActivity
+	pm enable com.google.android.gms/.update.SystemUpdateService
+	pm enable com.google.android.gms/.update.SystemUpdateService$ActiveReceiver
+	pm enable com.google.android.gms/.update.SystemUpdateService$Receiver
+	pm enable com.google.android.gms/.update.SystemUpdateService$SecretCodeReceiver
+	pm enable com.google.android.gsf/.update.SystemUpdateActivity
+	pm enable com.google.android.gsf/.update.SystemUpdatePanoActivity
+	pm enable com.google.android.gsf/.update.SystemUpdateService
+	pm enable com.google.android.gsf/.update.SystemUpdateService$Receiver
+	pm enable com.google.android.gsf/.update.SystemUpdateService$SecretCodeReceiver
+)&
diff --git a/ik.ramdisk/common/init.qcom.rc b/ik.ramdisk/common/init.qcom.rc
index 412392f73..aee4cfc3a 100755
--- a/ik.ramdisk/common/init.qcom.rc
+++ b/ik.ramdisk/common/init.qcom.rc
@@ -27,6 +27,7 @@
 
 import init.qcom.usb.rc
 import init.target.rc
+#import init.carrier.rc
 
 on early-init
     mount debugfs debugfs /sys/kernel/debug
diff --git a/include/linux/async.h b/include/linux/async.h
index 68a953019..364e7ff16 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -9,19 +9,46 @@
  * as published by the Free Software Foundation; version 2
  * of the License.
  */
+#ifndef __ASYNC_H__
+#define __ASYNC_H__
 
 #include <linux/types.h>
 #include <linux/list.h>
 
 typedef u64 async_cookie_t;
 typedef void (async_func_ptr) (void *data, async_cookie_t cookie);
+struct async_domain {
+	struct list_head node;
+	struct list_head domain;
+	int count;
+	unsigned registered:1;
+};
+
+/*
+ * domain participates in global async_synchronize_full
+ */
+#define ASYNC_DOMAIN(_name) \
+	struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \
+				      .domain = LIST_HEAD_INIT(_name.domain), \
+				      .count = 0, \
+				      .registered = 1 }
+
+/*
+ * domain is free to go out of scope as soon as all pending work is
+ * complete, this domain does not participate in async_synchronize_full
+ */
+#define ASYNC_DOMAIN_EXCLUSIVE(_name) \
+	struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \
+				      .domain = LIST_HEAD_INIT(_name.domain), \
+				      .count = 0, \
+				      .registered = 0 }
 
 extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data);
 extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-					    struct list_head *list);
+					    struct async_domain *domain);
 extern void async_synchronize_full(void);
-extern void async_synchronize_full_domain(struct list_head *list);
+extern void async_synchronize_full_domain(struct async_domain *domain);
 extern void async_synchronize_cookie(async_cookie_t cookie);
 extern void async_synchronize_cookie_domain(async_cookie_t cookie,
-					    struct list_head *list);
-
+					    struct async_domain *domain);
+#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6619eb255..f134c8334 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -431,6 +431,7 @@ struct request_queue {
 #define QUEUE_FLAG_SECDISCARD  17	/* supports SECDISCARD */
 #define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 #define QUEUE_FLAG_SANITIZE    19	/* supports SANITIZE */
+#define QUEUE_FLAG_FAST        20	/* fast block device (e.g. ram based) */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -513,6 +514,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_sanitize(q)	test_bit(QUEUE_FLAG_SANITIZE, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_fast(q)	test_bit(QUEUE_FLAG_FAST, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h
index 42e55deee..cb3e14234 100644
--- a/include/linux/cleancache.h
+++ b/include/linux/cleancache.h
@@ -5,6 +5,10 @@
 #include <linux/exportfs.h>
 #include <linux/mm.h>
 
+#define CLEANCACHE_NO_POOL		-1
+#define CLEANCACHE_NO_BACKEND		-2
+#define CLEANCACHE_NO_BACKEND_SHARED	-3
+
 #define CLEANCACHE_KEY_MAX 6
 
 /*
@@ -33,18 +37,17 @@ struct cleancache_ops {
 	void (*invalidate_fs)(int);
 };
 
-extern struct cleancache_ops
-	cleancache_register_ops(struct cleancache_ops *ops);
+extern int cleancache_register_ops(const struct cleancache_ops *ops);
 extern void __cleancache_init_fs(struct super_block *);
-extern void __cleancache_init_shared_fs(char *, struct super_block *);
+extern void __cleancache_init_shared_fs(struct super_block *);
 extern int  __cleancache_get_page(struct page *);
 extern void __cleancache_put_page(struct page *);
 extern void __cleancache_invalidate_page(struct address_space *, struct page *);
 extern void __cleancache_invalidate_inode(struct address_space *);
 extern void __cleancache_invalidate_fs(struct super_block *);
-extern int cleancache_enabled;
 
 #ifdef CONFIG_CLEANCACHE
+#define cleancache_enabled (1)
 static inline bool cleancache_fs_enabled(struct page *page)
 {
 	return page->mapping->host->i_sb->cleancache_poolid >= 0;
@@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb)
 		__cleancache_init_fs(sb);
 }
 
-static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+static inline void cleancache_init_shared_fs(struct super_block *sb)
 {
 	if (cleancache_enabled)
-		__cleancache_init_shared_fs(uuid, sb);
+		__cleancache_init_shared_fs(sb);
 }
 
 static inline int cleancache_get_page(struct page *page)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 923d093c9..7f5c2a39e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -164,6 +164,110 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
     (typeof(ptr)) (__ptr + (off)); })
 #endif
 
+#include <linux/types.h>
+
+#define __READ_ONCE_SIZE						\
+({									\
+	switch (size) {							\
+	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;		\
+	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;		\
+	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;		\
+	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;		\
+	default:							\
+		barrier();						\
+		__builtin_memcpy((void *)res, (const void *)p, size);	\
+		barrier();						\
+	}								\
+})
+
+static __always_inline
+void __read_once_size(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+
+#ifdef CONFIG_KASAN
+/*
+ * This function is not 'inline' because __no_sanitize_address confilcts
+ * with inlining. Attempt to inline it may cause a build failure.
+ * 	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
+ */
+static __no_sanitize_address __maybe_unused
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#else
+static __always_inline
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#endif
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
+	case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
+	case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
+	case 8: *(volatile __u64 *)p = *(__u64 *)res; break;
+	default:
+		barrier();
+		__builtin_memcpy((void *)p, (const void *)res, size);
+		barrier();
+	}
+}
+
+/*
+ * Prevent the compiler from merging or refetching reads or writes. The
+ * compiler is also forbidden from reordering successive instances of
+ * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the
+ * compiler is aware of some particular ordering.  One way to make the
+ * compiler aware of ordering is to put the two invocations of READ_ONCE,
+ * WRITE_ONCE or ACCESS_ONCE() in different C statements.
+ *
+ * In contrast to ACCESS_ONCE these two macros will also work on aggregate
+ * data types like structs or unions. If the size of the accessed data
+ * type exceeds the word size of the machine (e.g., 32 bits or 64 bits)
+ * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at
+ * least two memcpy()s: one for the __builtin_memcpy() and then one for
+ * the macro doing the copy of variable - '__u' allocated on the stack.
+ *
+ * Their two major use cases are: (1) Mediating communication between
+ * process-level code and irq/NMI handlers, all running on the same CPU,
+ * and (2) Ensuring that the compiler does not  fold, spindle, or otherwise
+ * mutilate accesses that either do not require ordering or that interact
+ * with an explicit memory barrier or atomic instruction that provides the
+ * required ordering.
+ */
+
+#define __READ_ONCE(x, check)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	if (check)							\
+		__read_once_size(&(x), __u.__c, sizeof(x));		\
+	else								\
+		__read_once_size_nocheck(&(x), __u.__c, sizeof(x));	\
+	__u.__val;							\
+})
+#define READ_ONCE(x) __READ_ONCE(x, 1)
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
+ * to hide memory access from KASAN.
+ */
+#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
+
+#define WRITE_ONCE(x, val) \
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (__force typeof(x)) (val) }; \
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 92f9fcdc5..233c27277 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -126,21 +126,33 @@ enum {
 #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
 #ifdef CONFIG_HOTPLUG_CPU
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int __register_cpu_notifier(struct notifier_block *nb);
 extern void unregister_cpu_notifier(struct notifier_block *nb);
+extern void __unregister_cpu_notifier(struct notifier_block *nb);
 #else
 
 #ifndef MODULE
 extern int register_cpu_notifier(struct notifier_block *nb);
+extern int __register_cpu_notifier(struct notifier_block *nb);
 #else
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 	return 0;
 }
+
+static inline int __register_cpu_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
 #endif
 
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
+
+static inline void __unregister_cpu_notifier(struct notifier_block *nb)
+{
+}
 #endif
 
 int cpu_up(unsigned int cpu);
@@ -148,6 +160,9 @@ void notify_cpu_starting(unsigned int cpu);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
 
+#define cpu_notifier_register_begin	cpu_maps_update_begin
+#define cpu_notifier_register_done	cpu_maps_update_done
+
 #else	/* CONFIG_SMP */
 
 #define cpu_notifier(fn, pri)	do { (void)(fn); } while (0)
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 3a4cef532..10ca9e2a7 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -83,6 +83,22 @@ enum fid_type {
 	 * 64 bit parent inode number.
 	 */
 	FILEID_NILFS_WITH_PARENT = 0x62,
+
+	/*
+	 * 32 bit generation number, 40 bit i_pos.
+	 */
+	FILEID_FAT_WITHOUT_PARENT = 0x71,
+
+	/*
+	 * 32 bit generation number, 40 bit i_pos,
+	 * 32 bit parent generation number, 40 bit parent i_pos
+	 */
+	FILEID_FAT_WITH_PARENT = 0x72,
+	
+	/*
+	 * Filesystems must not use 0xff file ID.
+	 */
+	FILEID_INVALID = 0xff,
 };
 
 struct fid {
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index b11e298e9..2b9894067 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -11,6 +11,7 @@
 #include <linux/user_namespace.h>
 #include <linux/securebits.h>
 #include <net/net_namespace.h>
+#include <linux/sched/sysctl.h>
 
 #ifdef CONFIG_SMP
 # define INIT_PUSHABLE_TASKS(tsk)					\
diff --git a/include/linux/list.h b/include/linux/list.h
index 1712c7e1f..8a8acdcf3 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -361,6 +361,28 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_first_entry(ptr, type, member) \
 	list_entry((ptr)->next, type, member)
 
+/**
+ * list_last_entry - get the last element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_last_entry(ptr, type, member) \
+	list_entry((ptr)->prev, type, member)
+
+/**
+ * list_first_entry_or_null - get the first element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Note that if the list is empty, it returns NULL.
+ */
+#define list_first_entry_or_null(ptr, type, member) \
+	(!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+
 /**
  * list_next_entry - get the next element in list
  * @pos:	the type * to cursor
@@ -432,9 +454,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @member:	the name of the list_struct within the struct.
  */
 #define list_for_each_entry(pos, head, member)				\
-	for (pos = list_entry((head)->next, typeof(*pos), member);	\
-	     &pos->member != (head); 	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
+	for (pos = list_first_entry(head, typeof(*pos), member);	\
+	     &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
 
 /**
  * list_for_each_entry_reverse - iterate backwards over list of given type.
@@ -443,9 +465,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @member:	the name of the list_struct within the struct.
  */
 #define list_for_each_entry_reverse(pos, head, member)			\
-	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
-	     &pos->member != (head); 	\
-	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+	for (pos = list_last_entry(head, typeof(*pos), member);		\
+	     &pos->member != (head); 					\
+	     pos = list_prev_entry(pos, member))
 
 /**
  * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
@@ -468,9 +490,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * the current position.
  */
 #define list_for_each_entry_continue(pos, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
-	     &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
+	for (pos = list_next_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
 
 /**
  * list_for_each_entry_continue_reverse - iterate backwards from the given point
@@ -482,9 +504,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * the current position.
  */
 #define list_for_each_entry_continue_reverse(pos, head, member)		\
-	for (pos = list_entry(pos->member.prev, typeof(*pos), member);	\
-	     &pos->member != (head);	\
-	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+	for (pos = list_prev_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = list_prev_entry(pos, member))
 
 /**
  * list_for_each_entry_from - iterate over list of given type from the current point
@@ -495,8 +517,8 @@ static inline void list_splice_tail_init(struct list_head *list,
  * Iterate over list of given type, continuing from current position.
  */
 #define list_for_each_entry_from(pos, head, member) 			\
-	for (; &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
+	for (; &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
 
 /**
  * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
@@ -506,10 +528,10 @@ static inline void list_splice_tail_init(struct list_head *list,
  * @member:	the name of the list_struct within the struct.
  */
 #define list_for_each_entry_safe(pos, n, head, member)			\
-	for (pos = list_entry((head)->next, typeof(*pos), member),	\
-		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	for (pos = list_first_entry(head, typeof(*pos), member),	\
+		n = list_next_entry(pos, member);			\
 	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+	     pos = n, n = list_next_entry(n, member))
 
 /**
  * list_for_each_entry_safe_continue - continue list iteration safe against removal
@@ -522,10 +544,10 @@ static inline void list_splice_tail_init(struct list_head *list,
  * safe against removal of list entry.
  */
 #define list_for_each_entry_safe_continue(pos, n, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
-		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	for (pos = list_next_entry(pos, member), 				\
+		n = list_next_entry(pos, member);				\
 	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+	     pos = n, n = list_next_entry(n, member))
 
 /**
  * list_for_each_entry_safe_from - iterate over list from current point safe against removal
@@ -538,9 +560,9 @@ static inline void list_splice_tail_init(struct list_head *list,
  * removal of list entry.
  */
 #define list_for_each_entry_safe_from(pos, n, head, member) 			\
-	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	for (n = list_next_entry(pos, member);					\
 	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+	     pos = n, n = list_next_entry(n, member))
 
 /**
  * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
@@ -553,10 +575,10 @@ static inline void list_splice_tail_init(struct list_head *list,
  * of list entry.
  */
 #define list_for_each_entry_safe_reverse(pos, n, head, member)		\
-	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
-		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	for (pos = list_last_entry(head, typeof(*pos), member),		\
+		n = list_prev_entry(pos, member);			\
 	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+	     pos = n, n = list_prev_entry(n, member))
 
 /**
  * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
@@ -571,7 +593,7 @@ static inline void list_splice_tail_init(struct list_head *list,
  * completing the current iteration of the loop body.
  */
 #define list_safe_reset_next(pos, n, member)				\
-	n = list_entry(pos->member.next, typeof(*pos), member)
+	n = list_next_entry(pos, member)
 
 /*
  * Double linked lists with a single pointer list head.
diff --git a/include/linux/lz4.h b/include/linux/lz4.h
new file mode 100644
index 000000000..6b784c59f
--- /dev/null
+++ b/include/linux/lz4.h
@@ -0,0 +1,87 @@
+#ifndef __LZ4_H__
+#define __LZ4_H__
+/*
+ * LZ4 Kernel Interface
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define LZ4_MEM_COMPRESS	(16384)
+#define LZ4HC_MEM_COMPRESS	(262144 + (2 * sizeof(unsigned char *)))
+
+/*
+ * lz4_compressbound()
+ * Provides the maximum size that LZ4 may output in a "worst case" scenario
+ * (input data not compressible)
+ */
+static inline size_t lz4_compressbound(size_t isize)
+{
+	return isize + (isize / 255) + 16;
+}
+
+/*
+ * lz4_compress()
+ *	src     : source address of the original data
+ *	src_len : size of the original data
+ *	dst	: output buffer address of the compressed data
+ *		This requires 'dst' of size LZ4_COMPRESSBOUND.
+ *	dst_len : is the output size, which is returned after compress done
+ *	workmem : address of the working memory.
+ *		This requires 'workmem' of size LZ4_MEM_COMPRESS.
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer and workmem must be already allocated with
+ *		the defined size.
+ */
+int lz4_compress(const unsigned char *src, size_t src_len,
+		unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+ /*
+  * lz4hc_compress()
+  *	 src	 : source address of the original data
+  *	 src_len : size of the original data
+  *	 dst	 : output buffer address of the compressed data
+  *		This requires 'dst' of size LZ4_COMPRESSBOUND.
+  *	 dst_len : is the output size, which is returned after compress done
+  *	 workmem : address of the working memory.
+  *		This requires 'workmem' of size LZ4HC_MEM_COMPRESS.
+  *	 return  : Success if return 0
+  *		   Error if return (< 0)
+  *	 note :  Destination buffer and workmem must be already allocated with
+  *		 the defined size.
+  */
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+		unsigned char *dst, size_t *dst_len, void *wrkmem);
+
+/*
+ * lz4_decompress()
+ *	src     : source address of the compressed data
+ *	src_len : is the input size, whcih is returned after decompress done
+ *	dest	: output buffer address of the decompressed data
+ *	actual_dest_len: is the size of uncompressed data, supposing it's known
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer must be already allocated.
+ *		slightly faster than lz4_decompress_unknownoutputsize()
+ */
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+		unsigned char *dest, size_t actual_dest_len);
+
+/*
+ * lz4_decompress_unknownoutputsize()
+ *	src     : source address of the compressed data
+ *	src_len : is the input size, therefore the compressed size
+ *	dest	: output buffer address of the decompressed data
+ *	dest_len: is the max size of the destination buffer, which is
+ *			returned with actual size of decompressed data after
+ *			decompress done
+ *	return  : Success if return 0
+ *		  Error if return (< 0)
+ *	note :  Destination buffer must be already allocated.
+ */
+int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
+		unsigned char *dest, size_t *dest_len);
+#endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5ca7ecfc..4e7889f1b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -389,7 +389,7 @@ static inline struct page *compound_head(struct page *page)
  * both from it and to it can be tracked, using atomic_inc_and_test
  * and atomic_add_negative(-1).
  */
-static inline void reset_page_mapcount(struct page *page)
+static inline void page_mapcount_reset(struct page *page)
 {
 	atomic_set(&(page)->_mapcount, -1);
 }
@@ -810,18 +810,7 @@ void page_address_init(void);
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
-extern struct address_space swapper_space;
-static inline struct address_space *page_mapping(struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-
-	VM_BUG_ON(PageSlab(page));
-	if (unlikely(PageSwapCache(page)))
-		mapping = &swapper_space;
-	else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
-		mapping = NULL;
-	return mapping;
-}
+extern struct address_space *page_mapping(struct page *page);
 
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
 static inline void *page_rmapping(struct page *page)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2d0243350..1bd8e64f3 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -115,6 +115,9 @@ enum pageflags {
 #endif
 #ifdef CONFIG_SDP
 	PG_sensitive,
+#endif
+#ifdef CONFIG_ZCACHE
+	PG_was_active,
 #endif
 	__NR_PAGEFLAGS,
 #if defined(CONFIG_CMA_PAGE_COUNTING)
@@ -221,6 +224,12 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobFree, slob_free)
 
+#ifdef CONFIG_ZCACHE
+PAGEFLAG(WasActive, was_active)
+#else
+PAGEFLAG_FALSE(WasActive)
+#endif
+
 /*
  * Private page markings that may be used by the filesystem that owns the page
  * for its own purposes.
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 696ca3920..530b807fc 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -236,6 +236,11 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x)
 
 typedef int filler_t(void *, struct page *);
 
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+                             pgoff_t index, unsigned long max_scan);
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+			     pgoff_t index, unsigned long max_scan);
+
 extern struct page * find_get_page(struct address_space *mapping,
 				pgoff_t index);
 extern struct page * find_lock_page(struct address_space *mapping,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 469c9536c..579baf06f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -176,6 +176,8 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2,
 	/* Provide indication device is assigned by a Virtual Machine Manager */
 	PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4,
+	/* Get VPD from function 0 VPD */
+	PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8),
 };
 
 enum pci_irq_reroute_variant {
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ffc444c38..74aee022b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 
 #define RADIX_TREE_MAX_TAGS 3
 
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+	unsigned int	height;		/* Height from the bottom */
+	unsigned int	count;
+	union {
+		struct radix_tree_node *parent;	/* Used when ascending tree */
+		struct rcu_head	rcu_head;	/* Used when freeing node */
+	};
+	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
+	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
+					  RADIX_TREE_MAP_SHIFT))
+
 /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
 struct radix_tree_root {
 	unsigned int		height;
@@ -101,6 +128,7 @@ do {									\
  *   concurrently with other readers.
  *
  * The notable exceptions to this rule are the following functions:
+ * __radix_tree_lookup
  * radix_tree_lookup
  * radix_tree_lookup_slot
  * radix_tree_tag_get
@@ -216,21 +244,29 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
 	rcu_assign_pointer(*pslot, item);
 }
 
+int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
+			struct radix_tree_node **nodep, void ***slotp);
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
+			  struct radix_tree_node **nodep, void ***slotp);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+			      struct radix_tree_node *node);
+void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+unsigned int
+radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results,
+			unsigned long *indices, unsigned long first_index,
+			unsigned int max_items);
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
-				unsigned long index, unsigned long max_scan);
-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
-				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
+int radix_tree_maybe_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
@@ -321,13 +357,29 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
 void **radix_tree_next_chunk(struct radix_tree_root *root,
 			     struct radix_tree_iter *iter, unsigned flags);
 
+/**
+ * radix_tree_iter_retry - retry this chunk of the iteration
+ * @iter:	iterator state
+ *
+ * If we iterate over a tree protected only by the RCU lock, a race
+ * against deletion or creation may result in seeing a slot for which
+ * radix_tree_deref_retry() returns true.  If so, call this function
+ * and continue the iteration.
+ */
+static inline __must_check
+void **radix_tree_iter_retry(struct radix_tree_iter *iter)
+{
+	iter->next_index = iter->index;
+	return NULL;
+}
+
 /**
  * radix_tree_chunk_size - get current chunk size
  *
  * @iter:	pointer to radix tree iterator
  * Returns:	current chunk size
  */
-static __always_inline unsigned
+static __always_inline long
 radix_tree_chunk_size(struct radix_tree_iter *iter)
 {
 	return iter->next_index - iter->index;
@@ -361,9 +413,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 			return slot + offset + 1;
 		}
 	} else {
-		unsigned size = radix_tree_chunk_size(iter) - 1;
+		long size = radix_tree_chunk_size(iter);
 
-		while (size--) {
+		while (--size > 0) {
 			slot++;
 			iter->index++;
 			if (likely(*slot))
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 033b507b3..a5aa7ae67 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -23,72 +23,7 @@
   I know it's not the cleaner way,  but in C (not in C++) to get
   performances and genericity...
 
-  Some example of insert and search follows here. The search is a plain
-  normal search over an ordered tree. The insert instead must be implemented
-  in two steps: First, the code must insert the element in order as a red leaf
-  in the tree, and then the support library function rb_insert_color() must
-  be called. Such function will do the not trivial work to rebalance the
-  rbtree, if necessary.
-
------------------------------------------------------------------------
-static inline struct page * rb_search_page_cache(struct inode * inode,
-						 unsigned long offset)
-{
-	struct rb_node * n = inode->i_rb_page_cache.rb_node;
-	struct page * page;
-
-	while (n)
-	{
-		page = rb_entry(n, struct page, rb_page_cache);
-
-		if (offset < page->offset)
-			n = n->rb_left;
-		else if (offset > page->offset)
-			n = n->rb_right;
-		else
-			return page;
-	}
-	return NULL;
-}
-
-static inline struct page * __rb_insert_page_cache(struct inode * inode,
-						   unsigned long offset,
-						   struct rb_node * node)
-{
-	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
-	struct rb_node * parent = NULL;
-	struct page * page;
-
-	while (*p)
-	{
-		parent = *p;
-		page = rb_entry(parent, struct page, rb_page_cache);
-
-		if (offset < page->offset)
-			p = &(*p)->rb_left;
-		else if (offset > page->offset)
-			p = &(*p)->rb_right;
-		else
-			return page;
-	}
-
-	rb_link_node(node, parent, p);
-
-	return NULL;
-}
-
-static inline struct page * rb_insert_page_cache(struct inode * inode,
-						 unsigned long offset,
-						 struct rb_node * node)
-{
-	struct page * ret;
-	if ((ret = __rb_insert_page_cache(inode, offset, node)))
-		goto out;
-	rb_insert_color(node, &inode->i_rb_page_cache);
- out:
-	return ret;
-}
------------------------------------------------------------------------
+  See Documentation/rbtree.txt for documentation and samples.
 */
 
 #ifndef	_LINUX_RBTREE_H
@@ -96,64 +31,37 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
 
 #include <linux/kernel.h>
 #include <linux/stddef.h>
+#include <linux/rcupdate.h>
 
-struct rb_node
-{
-	unsigned long  rb_parent_color;
-#define	RB_RED		0
-#define	RB_BLACK	1
+struct rb_node {
+	unsigned long  __rb_parent_color;
 	struct rb_node *rb_right;
 	struct rb_node *rb_left;
 } __attribute__((aligned(sizeof(long))));
     /* The alignment might seem pointless, but allegedly CRIS needs it */
 
-struct rb_root
-{
+struct rb_root {
 	struct rb_node *rb_node;
 };
 
 
-#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
-#define rb_color(r)   ((r)->rb_parent_color & 1)
-#define rb_is_red(r)   (!rb_color(r))
-#define rb_is_black(r) rb_color(r)
-#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
-#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
-	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
-}
-static inline void rb_set_color(struct rb_node *rb, int color)
-{
-	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
-}
+#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))
 
 #define RB_ROOT	(struct rb_root) { NULL, }
 #define	rb_entry(ptr, type, member) container_of(ptr, type, member)
 
-#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
-#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+#define RB_EMPTY_ROOT(root)  ((root)->rb_node == NULL)
+
+/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
+#define RB_EMPTY_NODE(node)  \
+	((node)->__rb_parent_color == (unsigned long)(node))
+#define RB_CLEAR_NODE(node)  \
+	((node)->__rb_parent_color = (unsigned long)(node))
 
-static inline void rb_init_node(struct rb_node *rb)
-{
-	rb->rb_parent_color = 0;
-	rb->rb_right = NULL;
-	rb->rb_left = NULL;
-	RB_CLEAR_NODE(rb);
-}
 
 extern void rb_insert_color(struct rb_node *, struct rb_root *);
 extern void rb_erase(struct rb_node *, struct rb_root *);
 
-typedef void (*rb_augment_f)(struct rb_node *node, void *data);
-
-extern void rb_augment_insert(struct rb_node *node,
-			      rb_augment_f func, void *data);
-extern struct rb_node *rb_augment_erase_begin(struct rb_node *node);
-extern void rb_augment_erase_end(struct rb_node *node,
-				 rb_augment_f func, void *data);
 
 /* Find logical next and previous nodes in a tree */
 extern struct rb_node *rb_next(const struct rb_node *);
@@ -161,17 +69,58 @@ extern struct rb_node *rb_prev(const struct rb_node *);
 extern struct rb_node *rb_first(const struct rb_root *);
 extern struct rb_node *rb_last(const struct rb_root *);
 
+/* Postorder iteration - always visit the parent after its children */
+extern struct rb_node *rb_first_postorder(const struct rb_root *);
+extern struct rb_node *rb_next_postorder(const struct rb_node *);
+
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 			    struct rb_root *root);
 
-static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
-				struct rb_node ** rb_link)
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+				struct rb_node **rb_link)
 {
-	node->rb_parent_color = (unsigned long )parent;
+	node->__rb_parent_color = (unsigned long)parent;
 	node->rb_left = node->rb_right = NULL;
 
 	*rb_link = node;
 }
 
+static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
+				    struct rb_node **rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	rcu_assign_pointer(*rb_link, node);
+}
+
+#define rb_entry_safe(ptr, type, member) \
+	({ typeof(ptr) ____ptr = (ptr); \
+	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
+	})
+
+/**
+ * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
+ * given type allowing the backing memory of @pos to be invalidated
+ *
+ * @pos:	the 'type *' to use as a loop cursor.
+ * @n:		another 'type *' to use as temporary storage
+ * @root:	'rb_root *' of the rbtree.
+ * @field:	the name of the rb_node field within 'type'.
+ *
+ * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
+ * list_for_each_entry_safe() and allows the iteration to continue independent
+ * of changes to @pos by the body of the loop.
+ *
+ * Note, however, that it cannot handle other modifications that re-order the
+ * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
+ * rb_erase() may rebalance the tree, causing us to miss some nodes.
+ */
+#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
+	for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
+	     pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
+			typeof(*pos), field); 1; }); \
+	     pos = n)
+
 #endif	/* _LINUX_RBTREE_H */
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
new file mode 100644
index 000000000..14d7b831b
--- /dev/null
+++ b/include/linux/rbtree_augmented.h
@@ -0,0 +1,249 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree_augmented.h
+*/
+
+#ifndef _LINUX_RBTREE_AUGMENTED_H
+#define _LINUX_RBTREE_AUGMENTED_H
+
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+
+/*
+ * Please note - only struct rb_augment_callbacks and the prototypes for
+ * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
+ * The rest are implementation details you are not expected to depend on.
+ *
+ * See Documentation/rbtree.txt for documentation and samples.
+ */
+
+struct rb_augment_callbacks {
+	void (*propagate)(struct rb_node *node, struct rb_node *stop);
+	void (*copy)(struct rb_node *old, struct rb_node *new);
+	void (*rotate)(struct rb_node *old, struct rb_node *new);
+};
+
+extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+/*
+ * Fixup the rbtree and update the augmented information when rebalancing.
+ *
+ * On insertion, the user must update the augmented information on the path
+ * leading to the inserted node, then call rb_link_node() as usual and
+ * rb_augment_inserted() instead of the usual rb_insert_color() call.
+ * If rb_augment_inserted() rebalances the rbtree, it will callback into
+ * a user provided function to update the augmented information on the
+ * affected subtrees.
+ */
+static inline void
+rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+		    const struct rb_augment_callbacks *augment)
+{
+	__rb_insert_augmented(node, root, augment->rotate);
+}
+
+#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield,	\
+			     rbtype, rbaugmented, rbcompute)		\
+static inline void							\
+rbname ## _propagate(struct rb_node *rb, struct rb_node *stop)		\
+{									\
+	while (rb != stop) {						\
+		rbstruct *node = rb_entry(rb, rbstruct, rbfield);	\
+		rbtype augmented = rbcompute(node);			\
+		if (node->rbaugmented == augmented)			\
+			break;						\
+		node->rbaugmented = augmented;				\
+		rb = rb_parent(&node->rbfield);				\
+	}								\
+}									\
+static inline void							\
+rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)		\
+{									\
+	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
+	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
+	new->rbaugmented = old->rbaugmented;				\
+}									\
+static void								\
+rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)	\
+{									\
+	rbstruct *old = rb_entry(rb_old, rbstruct, rbfield);		\
+	rbstruct *new = rb_entry(rb_new, rbstruct, rbfield);		\
+	new->rbaugmented = old->rbaugmented;				\
+	old->rbaugmented = rbcompute(old);				\
+}									\
+rbstatic const struct rb_augment_callbacks rbname = {			\
+	rbname ## _propagate, rbname ## _copy, rbname ## _rotate	\
+};
+
+
+#define	RB_RED		0
+#define	RB_BLACK	1
+
+#define __rb_parent(pc)    ((struct rb_node *)(pc & ~3))
+
+#define __rb_color(pc)     ((pc) & 1)
+#define __rb_is_black(pc)  __rb_color(pc)
+#define __rb_is_red(pc)    (!__rb_color(pc))
+#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
+#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
+#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
+}
+
+static inline void rb_set_parent_color(struct rb_node *rb,
+				       struct rb_node *p, int color)
+{
+	rb->__rb_parent_color = (unsigned long)p | color;
+}
+
+static inline void
+__rb_change_child(struct rb_node *old, struct rb_node *new,
+		  struct rb_node *parent, struct rb_root *root)
+{
+	if (parent) {
+		if (parent->rb_left == old)
+			WRITE_ONCE(parent->rb_left, new);
+		else
+			WRITE_ONCE(parent->rb_right, new);
+	} else
+		WRITE_ONCE(root->rb_node, new);
+}
+
+extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
+
+static __always_inline struct rb_node *
+__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+		     const struct rb_augment_callbacks *augment)
+{
+	struct rb_node *child = node->rb_right;
+	struct rb_node *tmp = node->rb_left;
+	struct rb_node *parent, *rebalance;
+	unsigned long pc;
+
+	if (!tmp) {
+		/*
+		 * Case 1: node to erase has no more than 1 child (easy!)
+		 *
+		 * Note that if there is one child it must be red due to 5)
+		 * and node must be black due to 4). We adjust colors locally
+		 * so as to bypass __rb_erase_color() later on.
+		 */
+		pc = node->__rb_parent_color;
+		parent = __rb_parent(pc);
+		__rb_change_child(node, child, parent, root);
+		if (child) {
+			child->__rb_parent_color = pc;
+			rebalance = NULL;
+		} else
+			rebalance = __rb_is_black(pc) ? parent : NULL;
+		tmp = parent;
+	} else if (!child) {
+		/* Still case 1, but this time the child is node->rb_left */
+		tmp->__rb_parent_color = pc = node->__rb_parent_color;
+		parent = __rb_parent(pc);
+		__rb_change_child(node, tmp, parent, root);
+		rebalance = NULL;
+		tmp = parent;
+	} else {
+		struct rb_node *successor = child, *child2;
+
+		tmp = child->rb_left;
+		if (!tmp) {
+			/*
+			 * Case 2: node's successor is its right child
+			 *
+			 *    (n)          (s)
+			 *    / \          / \
+			 *  (x) (s)  ->  (x) (c)
+			 *        \
+			 *        (c)
+			 */
+			parent = successor;
+			child2 = successor->rb_right;
+
+			augment->copy(node, successor);
+		} else {
+			/*
+			 * Case 3: node's successor is leftmost under
+			 * node's right child subtree
+			 *
+			 *    (n)          (s)
+			 *    / \          / \
+			 *  (x) (y)  ->  (x) (y)
+			 *      /            /
+			 *    (p)          (p)
+			 *    /            /
+			 *  (s)          (c)
+			 *    \
+			 *    (c)
+			 */
+			do {
+				parent = successor;
+				successor = tmp;
+				tmp = tmp->rb_left;
+			} while (tmp);
+			child2 = successor->rb_right;
+			WRITE_ONCE(parent->rb_left, child2);
+			WRITE_ONCE(successor->rb_right, child);
+			rb_set_parent(child, successor);
+
+			augment->copy(node, successor);
+			augment->propagate(parent, successor);
+		}
+
+		tmp = node->rb_left;
+		WRITE_ONCE(successor->rb_left, tmp);
+		rb_set_parent(tmp, successor);
+
+		pc = node->__rb_parent_color;
+		tmp = __rb_parent(pc);
+		__rb_change_child(node, successor, tmp, root);
+
+		if (child2) {
+			successor->__rb_parent_color = pc;
+			rb_set_parent_color(child2, parent, RB_BLACK);
+			rebalance = NULL;
+		} else {
+			unsigned long pc2 = successor->__rb_parent_color;
+			successor->__rb_parent_color = pc;
+			rebalance = __rb_is_black(pc2) ? parent : NULL;
+		}
+		tmp = successor;
+	}
+
+	augment->propagate(tmp, NULL);
+	return rebalance;
+}
+
+static __always_inline void
+rb_erase_augmented(struct rb_node *node, struct rb_root *root,
+		   const struct rb_augment_callbacks *augment)
+{
+	struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
+	if (rebalance)
+		__rb_erase_color(rebalance, root, augment->rotate);
+}
+
+#endif	/* _LINUX_RBTREE_AUGMENTED_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bf4b7ba11..55eac4d5f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -338,19 +338,6 @@ static inline void lockup_detector_init(void)
 }
 #endif
 
-#ifdef CONFIG_DETECT_HUNG_TASK
-extern unsigned int  sysctl_hung_task_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
-extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
-					 void __user *buffer,
-					 size_t *lenp, loff_t *ppos);
-#else
-/* Avoid need for ifdefs elsewhere in the code */
-enum { sysctl_hung_task_timeout_secs = 0 };
-#endif
-
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 
@@ -372,23 +359,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
 struct nsproxy;
 struct user_namespace;
 
-/*
- * Default maximum number of active map areas, this limits the number of vmas
- * per mm struct. Users can overwrite this number by sysctl but there is a
- * problem.
- *
- * When a program's coredump is generated as ELF format, a section is created
- * per a vma. In ELF, the number of sections is represented in unsigned short.
- * This means the number of sections should be smaller than 65535 at coredump.
- * Because the kernel adds some informative sections to a image of program at
- * generating coredump, we need some margin. The number of extra sections is
- * 1-3 now and depends on arch. We use "5" as safe margin, here.
- */
-#define MAPCOUNT_ELF_CORE_MARGIN	(5)
-#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
-
-extern int sysctl_max_map_count;
-
 #include <linux/aio.h>
 
 #ifdef CONFIG_MMU
@@ -1258,12 +1228,6 @@ struct sched_rt_entity {
 #endif
 };
 
-/*
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define RR_TIMESLICE		(100 * HZ / 1000)
-
 struct rcu_node;
 
 enum perf_event_task_context {
@@ -2083,52 +2047,10 @@ extern void wake_up_idle_cpu(int cpu);
 static inline void wake_up_idle_cpu(int cpu) { }
 #endif
 
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
-extern unsigned int sysctl_sched_wake_to_idle;
-
-enum sched_tunable_scaling {
-	SCHED_TUNABLESCALING_NONE,
-	SCHED_TUNABLESCALING_LOG,
-	SCHED_TUNABLESCALING_LINEAR,
-	SCHED_TUNABLESCALING_END,
-};
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
-
-#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_migration_cost;
-extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
-extern unsigned int sysctl_sched_shares_window;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *length,
-		loff_t *ppos);
-#endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return 1;
-}
-#endif
-extern unsigned int sysctl_sched_rt_period;
-extern int sysctl_sched_rt_runtime;
-
-int sched_rt_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos);
+extern unsigned int sysctl_sched_ravg_window;
+extern unsigned int sysctl_sched_wakeup_load_threshold;
 
 #ifdef CONFIG_SCHED_AUTOGROUP
-extern unsigned int sysctl_sched_autogroup_enabled;
-
 extern void sched_autogroup_create_attach(struct task_struct *p);
 extern void sched_autogroup_detach(struct task_struct *p);
 extern void sched_autogroup_fork(struct signal_struct *sig);
@@ -2144,10 +2066,6 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
 static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 #endif
 
-#ifdef CONFIG_CFS_BANDWIDTH
-extern unsigned int sysctl_sched_cfs_bandwidth_slice;
-#endif
-
 #ifdef CONFIG_RT_MUTEXES
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
new file mode 100644
index 000000000..d8c6e8606
--- /dev/null
+++ b/include/linux/sched/sysctl.h
@@ -0,0 +1,91 @@
+#ifndef _SCHED_SYSCTL_H
+#define _SCHED_SYSCTL_H
+
+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int  sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+					 void __user *buffer,
+					 size_t *lenp, loff_t *ppos);
+#else
+/* Avoid need for ifdefs elsewhere in the code */
+enum { sysctl_hung_task_timeout_secs = 0 };
+#endif
+
+/*
+ * Default maximum number of active map areas, this limits the number of vmas
+ * per mm struct. Users can overwrite this number by sysctl but there is a
+ * problem.
+ *
+ * When a program's coredump is generated as ELF format, a section is created
+ * per a vma. In ELF, the number of sections is represented in unsigned short.
+ * This means the number of sections should be smaller than 65535 at coredump.
+ * Because the kernel adds some informative sections to a image of program at
+ * generating coredump, we need some margin. The number of extra sections is
+ * 1-3 now and depends on arch. We use "5" as safe margin, here.
+ */
+#define MAPCOUNT_ELF_CORE_MARGIN	(5)
+#define DEFAULT_MAX_MAP_COUNT	(USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+
+extern int sysctl_max_map_count;
+
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_wake_to_idle;
+
+enum sched_tunable_scaling {
+	SCHED_TUNABLESCALING_NONE,
+	SCHED_TUNABLESCALING_LOG,
+	SCHED_TUNABLESCALING_LINEAR,
+	SCHED_TUNABLESCALING_END,
+};
+extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_time_avg;
+extern unsigned int sysctl_timer_migration;
+extern unsigned int sysctl_sched_shares_window;
+
+int sched_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length,
+		loff_t *ppos);
+#endif
+#ifdef CONFIG_SCHED_DEBUG
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return sysctl_timer_migration;
+}
+#else
+static inline unsigned int get_sysctl_timer_migration(void)
+{
+	return 1;
+}
+#endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
+
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+extern unsigned int sysctl_sched_autogroup_enabled;
+#endif
+
+/*
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define RR_TIMESLICE		(100 * HZ / 1000)
+
+int sched_rt_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
+#endif /* _SCHED_SYSCTL_H */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index e156ce155..6218a2dad 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -142,6 +142,41 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
 int seq_put_decimal_ll(struct seq_file *m, char delimiter,
 			long long num);
 
+/**
+ * seq_show_options - display mount options with appropriate escapes.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, can be NULL
+ */
+static inline void seq_show_option(struct seq_file *m, const char *name,
+				   const char *value)
+{
+	seq_putc(m, ',');
+	seq_escape(m, name, ",= \t\n\\");
+	if (value) {
+		seq_putc(m, '=');
+		seq_escape(m, value, ", \t\n\\");
+	}
+}
+
+/**
+ * seq_show_option_n - display mount options with appropriate escapes
+ *		       where @value must be a specific length.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option name's value, cannot be NULL
+ * @length: the length of @value to display
+ *
+ * This is a macro since this uses "length" to define the size of the
+ * stack buffer.
+ */
+#define seq_show_option_n(m, name, value, length) {	\
+	char val_buf[length + 1];			\
+	strncpy(val_buf, value, length);		\
+	val_buf[length] = '\0';				\
+	seq_show_option(m, name, val_buf);		\
+}
+
 #define SEQ_START_TOKEN ((void *)1)
 /*
  * Helpers for iteration over list_head-s in seq_files
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 5c25b9a8c..e4a26ea07 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,6 +4,12 @@
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extention later.
+ *
+ * The 'gfpmask' refers to the allocation we are currently trying to
+ * fulfil.
+ *
+ * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
+ * querying the cache size, so a fastpath for that case is appropriate.
  */
 struct shrink_control {
 	gfp_t gfp_mask;
@@ -14,24 +20,38 @@ struct shrink_control {
 	int priority;
 };
 
+#define SHRINK_STOP (~0UL)
+
 /*
  * A callback you can register to apply pressure to ageable caches.
  *
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'.  It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up.  It should return
- * the number of objects which remain in the cache.  If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
- * The callback must not return -1 if nr_to_scan is zero.
+ * @shrink() should look through the least-recently-used 'nr_to_scan' entries
+ * and attempt to free them up.  It should return the number of objects which
+ * remain in the cache.  If it returns -1, it means it cannot do any scanning at
+ * this time (eg. there is a risk of deadlock).
  *
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
+ * @count_objects should return the number of freeable items in the cache. If
+ * there are no objects to free or the number of freeable items cannot be
+ * determined, it should return 0. No deadlock checks should be done during the
+ * count callback - the shrinker relies on aggregating scan counts that couldn't
+ * be executed due to potential deadlocks to be run at a later call when the
+ * deadlock condition is no longer pending.
  *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
+ * @scan_objects will only be called if @count_objects returned a non-zero
+ * value for the number of freeable objects. The callout should scan the cache
+ * and attempt to free items from the cache. It should then return the number
+ * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
+ * due to potential deadlocks. If SHRINK_STOP is returned, then no further
+ * attempts to call the @scan_objects will be made from the current reclaim
+ * context.
  */
 struct shrinker {
 	int (*shrink)(struct shrinker *, struct shrink_control *sc);
+	unsigned long (*count_objects)(struct shrinker *,
+ 				       struct shrink_control *sc);
+ 	unsigned long (*scan_objects)(struct shrinker *,
+ 				      struct shrink_control *sc);
+ 
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
 
diff --git a/include/linux/state_notifier.h b/include/linux/state_notifier.h
new file mode 100644
index 000000000..34f9287aa
--- /dev/null
+++ b/include/linux/state_notifier.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_STATE_NOTIFIER_H
+#define __LINUX_STATE_NOTIFIER_H
+
+#include <linux/notifier.h>
+
+#define STATE_NOTIFIER_ACTIVE		0x01
+#define STATE_NOTIFIER_SUSPEND		0x02
+
+struct state_event {
+	void *data;
+};
+
+extern void state_suspend(void);
+extern void state_resume(void);
+int state_register_client(struct notifier_block *nb);
+int state_unregister_client(struct notifier_block *nb);
+int state_notifier_call_chain(unsigned long val, void *v);
+
+#endif /* _LINUX_STATE_NOTIFIER_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 886dda7f7..453425ace 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/node.h>
 
+#include <linux/fs.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -19,10 +20,13 @@ struct bio;
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
-#define SWAP_FLAG_DISCARD	0x10000 /* discard swap cluster after use */
+#define SWAP_FLAG_DISCARD	0x10000 /* enable discard for swap */
+#define SWAP_FLAG_DISCARD_ONCE	0x20000 /* discard swap area at swapon-time */
+#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */
 
 #define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
-				 SWAP_FLAG_DISCARD)
+				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
+				 SWAP_FLAG_DISCARD_PAGES)
 
 static inline int current_is_kswapd(void)
 {
@@ -146,16 +150,20 @@ struct swap_extent {
 enum {
 	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
 	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
-	SWP_DISCARDABLE = (1 << 2),	/* swapon+blkdev support discard */
+	SWP_DISCARDABLE = (1 << 2),	/* blkdev support discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
 	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
 	SWP_BLKDEV	= (1 << 6),	/* its a block device */
+	SWP_FILE	= (1 << 7),	/* set after swap_activate success */
+	SWP_AREA_DISCARD = (1 << 8),	/* single-time swap area discards */
+	SWP_PAGE_DISCARD = (1 << 9),	/* freed swap page-cluster discards */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
+	SWP_FAST	= (1 << 10),	/* blkdev access is fast and cheap */
+	SWP_SCANNING	= (1 << 11),	/* refcount in scan_swap_map */
 };
 
-#define SWAP_CLUSTER_MAX 32
+#define SWAP_CLUSTER_MAX 32UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
 /*
@@ -329,14 +337,16 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_write(struct bio *bio, int err);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	void (*end_write_func)(struct bio *, int));
 extern void end_swap_bio_read(struct bio *bio, int err);
 
 /* linux/mm/swap_state.c */
-extern struct address_space swapper_space;
-#define total_swapcache_pages  swapper_space.nrpages
+extern struct address_space swapper_spaces[];
+#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
+extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *);
 extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@@ -354,10 +364,18 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
 /* linux/mm/swapfile.c */
 extern atomic_long_t nr_swap_pages;
 extern long total_swap_pages;
+extern bool is_swap_fast(swp_entry_t entry);
 
 /* Swap 50% full? Release swapcache more aggressively.. */
-static inline bool vm_swap_full(void)
+static inline bool vm_swap_full(struct swap_info_struct *si)
 {
+	/*
+	 * If the swap device is fast, return true
+	 * not to delay swap free.
+	 */
+	if (si->flags & SWP_FAST)
+		return true;
+
 	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
 }
 
@@ -401,8 +419,8 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 #define get_nr_swap_pages()			0L
 #define total_swap_pages			0L
-#define total_swapcache_pages			0UL
-#define vm_swap_full()				0
+#define total_swapcache_pages()			0UL
+#define vm_swap_full(si)			0
 
 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 508872747..a520fd70a 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head)
 
 static inline void timerqueue_init(struct timerqueue_node *node)
 {
-	rb_init_node(&node->node);
+	RB_CLEAR_NODE(&node->node);
 }
 
 static inline void timerqueue_init_head(struct timerqueue_head *head)
diff --git a/include/linux/unlz4.h b/include/linux/unlz4.h
new file mode 100644
index 000000000..3273c2f36
--- /dev/null
+++ b/include/linux/unlz4.h
@@ -0,0 +1,10 @@
+#ifndef DECOMPRESS_UNLZ4_H
+#define DECOMPRESS_UNLZ4_H
+
+int unlz4(unsigned char *inbuf, long len,
+	long (*fill)(void*, unsigned long),
+	long (*flush)(void*, unsigned long),
+	unsigned char *output,
+	long *pos,
+	void(*error)(char *x));
+#endif
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
new file mode 100644
index 000000000..e183a0a65
--- /dev/null
+++ b/include/linux/zbud.h
@@ -0,0 +1,22 @@
+#ifndef _ZBUD_H_
+#define _ZBUD_H_
+
+#include <linux/types.h>
+
+struct zbud_pool;
+
+struct zbud_ops {
+	int (*evict)(struct zbud_pool *pool, unsigned long handle);
+};
+
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
+void zbud_destroy_pool(struct zbud_pool *pool);
+int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+	unsigned long *handle);
+void zbud_free(struct zbud_pool *pool, unsigned long handle);
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
+void *zbud_map(struct zbud_pool *pool, unsigned long handle);
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
+u64 zbud_get_pool_size(struct zbud_pool *pool);
+
+#endif /* _ZBUD_H_ */
diff --git a/include/linux/zcache.h b/include/linux/zcache.h
new file mode 100644
index 000000000..2db7e4bbb
--- /dev/null
+++ b/include/linux/zcache.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _LINUX_ZCACHE_H
+#define _LINUX_ZCACHE_H
+
+#ifdef CONFIG_ZCACHE
+extern u64 zcache_pages(void);
+#else
+u64 zcache_pages(void) { return 0; }
+#endif
+
+#endif /* _LINUX_ZCACHE_H */
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
new file mode 100644
index 000000000..24e0173c7
--- /dev/null
+++ b/include/linux/zpool.h
@@ -0,0 +1,110 @@
+/*
+ * zpool memory storage api
+ *
+ * Copyright (C) 2014 Dan Streetman
+ *
+ * This is a common frontend for the zbud and zsmalloc memory
+ * storage pool implementations.  Typically, this is used to
+ * store compressed memory.
+ */
+
+#ifndef _ZPOOL_H_
+#define _ZPOOL_H_
+
+struct zpool;
+
+struct zpool_ops {
+	int (*evict)(struct zpool *pool, unsigned long handle);
+};
+
+/*
+ * Control how a handle is mapped.  It will be ignored if the
+ * implementation does not support it.  Its use is optional.
+ * Note that this does not refer to memory protection, it
+ * refers to how the memory will be copied in/out if copying
+ * is necessary during mapping; read-write is the safest as
+ * it copies the existing memory in on map, and copies the
+ * changed memory back out on unmap.  Write-only does not copy
+ * in the memory and should only be used for initialization.
+ * If in doubt, use ZPOOL_MM_DEFAULT which is read-write.
+ */
+enum zpool_mapmode {
+	ZPOOL_MM_RW, /* normal read-write mapping */
+	ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */
+	ZPOOL_MM_WO, /* write-only (no copy-in at map time) */
+
+	ZPOOL_MM_DEFAULT = ZPOOL_MM_RW
+};
+
+bool zpool_has_pool(char *type);
+
+struct zpool *zpool_create_pool(const char *type, const char *name,
+			gfp_t gfp, const struct zpool_ops *ops);
+
+char *zpool_get_type(struct zpool *pool);
+
+void zpool_destroy_pool(struct zpool *pool);
+
+int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle);
+
+void zpool_free(struct zpool *pool, unsigned long handle);
+
+int zpool_shrink(struct zpool *pool, unsigned int pages,
+			unsigned int *reclaimed);
+
+void *zpool_map_handle(struct zpool *pool, unsigned long handle,
+			enum zpool_mapmode mm);
+
+void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
+
+u64 zpool_get_total_size(struct zpool *pool);
+
+
+/**
+ * struct zpool_driver - driver implementation for zpool
+ * @type:	name of the driver.
+ * @list:	entry in the list of zpool drivers.
+ * @create:	create a new pool.
+ * @destroy:	destroy a pool.
+ * @malloc:	allocate mem from a pool.
+ * @free:	free mem from a pool.
+ * @shrink:	shrink the pool.
+ * @map:	map a handle.
+ * @unmap:	unmap a handle.
+ * @total_size:	get total size of a pool.
+ *
+ * This is created by a zpool implementation and registered
+ * with zpool.
+ */
+struct zpool_driver {
+	char *type;
+	struct module *owner;
+	atomic_t refcount;
+	struct list_head list;
+
+	void *(*create)(const char *name,
+			gfp_t gfp,
+			const struct zpool_ops *ops,
+			struct zpool *zpool);
+	void (*destroy)(void *pool);
+
+	int (*malloc)(void *pool, size_t size, gfp_t gfp,
+				unsigned long *handle);
+	void (*free)(void *pool, unsigned long handle);
+
+	int (*shrink)(void *pool, unsigned int pages,
+				unsigned int *reclaimed);
+
+	void *(*map)(void *pool, unsigned long handle,
+				enum zpool_mapmode mm);
+	void (*unmap)(void *pool, unsigned long handle);
+
+	u64 (*total_size)(void *pool);
+};
+
+void zpool_register_driver(struct zpool_driver *driver);
+
+int zpool_unregister_driver(struct zpool_driver *driver);
+
+#endif
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 398dae365..57a8e98f2 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -2,6 +2,7 @@
  * zsmalloc memory allocator
  *
  * Copyright (C) 2011  Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the license that better fits your requirements.
@@ -14,14 +15,13 @@
 #define _ZS_MALLOC_H_
 
 #include <linux/types.h>
-#include <linux/mm_types.h>
 
 /*
  * zsmalloc mapping modes
  *
  * NOTE: These only make a difference when a mapped object spans pages.
- *       They also have no effect when PGTABLE_MAPPING is selected.
-*/
+ * They also have no effect when PGTABLE_MAPPING is selected.
+ */
 enum zs_mapmode {
 	ZS_MM_RW, /* normal read-write mapping */
 	ZS_MM_RO, /* read-only (no copy-out at unmap time) */
@@ -34,14 +34,14 @@ enum zs_mapmode {
 	 */
 };
 
-struct zs_ops {
-	struct page * (*alloc)(gfp_t);
-	void (*free)(struct page *);
+struct zs_pool_stats {
+	/* How many pages were migrated (freed) */
+	unsigned long pages_compacted;
 };
 
 struct zs_pool;
 
-struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops);
+struct zs_pool *zs_create_pool(const char *name);
 void zs_destroy_pool(struct zs_pool *pool);
 
 unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
@@ -51,6 +51,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 			enum zs_mapmode mm);
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
 
-u64 zs_get_total_size_bytes(struct zs_pool *pool);
+unsigned long zs_get_total_pages(struct zs_pool *pool);
+unsigned long zs_compact(struct zs_pool *pool);
 
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
 #endif
diff --git a/include/sound/wm8904.h b/include/sound/wm8904.h
index 898be3a8d..6d8f8fba3 100644
--- a/include/sound/wm8904.h
+++ b/include/sound/wm8904.h
@@ -119,7 +119,7 @@
 #define WM8904_MIC_REGS  2
 #define WM8904_GPIO_REGS 4
 #define WM8904_DRC_REGS  4
-#define WM8904_EQ_REGS   25
+#define WM8904_EQ_REGS   24
 
 /**
  * DRC configurations are specified with a label and a set of register
diff --git a/kernel/async.c b/kernel/async.c
index 32d8dc960..b7717733f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -62,7 +62,7 @@ static async_cookie_t next_cookie = 1;
 #define MAX_WORK	32768
 
 static LIST_HEAD(async_pending);
-static LIST_HEAD(async_running);
+static ASYNC_DOMAIN(async_running);
 static DEFINE_SPINLOCK(async_lock);
 
 struct async_entry {
@@ -71,7 +71,7 @@ struct async_entry {
 	async_cookie_t		cookie;
 	async_func_ptr		*func;
 	void			*data;
-	struct list_head	*running;
+	struct async_domain	*running;
 };
 
 static DECLARE_WAIT_QUEUE_HEAD(async_done);
@@ -82,7 +82,7 @@ static atomic_t entry_count;
 /*
  * MUST be called with the lock held!
  */
-static async_cookie_t  __lowest_in_progress(struct list_head *running)
+static async_cookie_t  __lowest_in_progress(struct async_domain *running)
 {
 	struct async_entry *entry;
 
@@ -93,9 +93,8 @@ static async_cookie_t  __lowest_in_progress(struct list_head *running)
 			return next_cookie;
 	}
 
-	if (!list_empty(running)) {
-		entry = list_first_entry(running,
-			struct async_entry, list);
+	if (!list_empty(&running->domain)) {
+		entry = list_first_entry(&running->domain, typeof(*entry), list);
 		return entry->cookie;
 	}
 
@@ -106,7 +105,7 @@ static async_cookie_t  __lowest_in_progress(struct list_head *running)
 	return next_cookie;	/* "infinity" value */
 }
 
-static async_cookie_t  lowest_in_progress(struct list_head *running)
+static async_cookie_t  lowest_in_progress(struct async_domain *running)
 {
 	unsigned long flags;
 	async_cookie_t ret;
@@ -126,10 +125,11 @@ static void async_run_entry_fn(struct work_struct *work)
 		container_of(work, struct async_entry, work);
 	unsigned long flags;
 	ktime_t uninitialized_var(calltime), delta, rettime;
+	struct async_domain *running = entry->running;
 
 	/* 1) move self to the running queue */
 	spin_lock_irqsave(&async_lock, flags);
-	list_move_tail(&entry->list, entry->running);
+	list_move_tail(&entry->list, &running->domain);
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 2) run (and print duration) */
@@ -163,7 +163,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * Note: This function may be called from atomic or non-atomic contexts.
  */
 async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
-				     struct list_head *running)
+				     struct async_domain *running)
 {
 	return __async_schedule(ptr, data, running);
 }
@@ -249,14 +249,14 @@ EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
  * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
- * @list: running list to synchronize on
+ * @domain: running list to synchronize on
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list have been done.
+ * synchronization domain specified by the running list @domain have been done.
  */
-void async_synchronize_full_domain(struct list_head *list)
+void async_synchronize_full_domain(struct async_domain *domain)
 {
-	async_synchronize_cookie_domain(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, domain);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
@@ -266,11 +266,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
  * @running: running list to synchronize on, NULL indicates all lists
  *
  * This function waits until all asynchronous function calls for the
- * synchronization domain specified by the running list @list submitted
+ * synchronization domain specified by running list @running submitted
  * prior to @cookie have been done.
  */
-void async_synchronize_cookie_domain(async_cookie_t cookie,
-				     struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running)
 {
 	ktime_t uninitialized_var(starttime), delta, endtime;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ad70cada2..f48e4e739 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1053,15 +1053,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 
 	mutex_lock(&cgroup_root_mutex);
 	for_each_subsys(root, ss)
-		seq_printf(seq, ",%s", ss->name);
+		seq_show_option(seq, ss->name, NULL);
 	if (test_bit(ROOT_NOPREFIX, &root->flags))
 		seq_puts(seq, ",noprefix");
 	if (strlen(root->release_agent_path))
-		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+		seq_show_option(seq, "release_agent",
+				root->release_agent_path);
 	if (clone_children(&root->top_cgroup))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
-		seq_printf(seq, ",name=%s", root->name);
+		seq_show_option(seq, "name", root->name);
 	mutex_unlock(&cgroup_root_mutex);
 	return 0;
 }
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 02caf610c..b2e02501b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -166,6 +166,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
 	return ret;
 }
 
+int __ref __register_cpu_notifier(struct notifier_block *nb)
+{
+	return raw_notifier_chain_register(&cpu_chain, nb);
+}
+
 static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 			int *nr_calls)
 {
@@ -189,6 +194,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
 	BUG_ON(cpu_notify(val, v));
 }
 EXPORT_SYMBOL(register_cpu_notifier);
+EXPORT_SYMBOL(__register_cpu_notifier);
 
 void __ref unregister_cpu_notifier(struct notifier_block *nb)
 {
@@ -198,6 +204,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
+void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+{
+	raw_notifier_chain_unregister(&cpu_chain, nb);
+}
+EXPORT_SYMBOL(__unregister_cpu_notifier);
+
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
@@ -262,6 +274,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		goto out_release;
 	}
 
+	/*
+	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+	 * and RCU users of this state to go away such that all new such users
+	 * will observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so explicitly call both.
+	 */
+#ifdef CONFIG_PREEMPT
+	synchronize_sched();
+#endif
+	synchronize_rcu();
+
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
+
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 6a72395d3..72bc17919 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -44,6 +44,7 @@
 #include <linux/err.h>
 #include <linux/debugobjects.h>
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
 #include <linux/timer.h>
 
 #include <asm/uaccess.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index fb655f5f9..15374d0ac 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
 #include <linux/seq_file.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/mutex.h>
 
 #include "internals.h"
 
@@ -326,18 +327,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
 
 void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 {
+	static DEFINE_MUTEX(register_lock);
 	char name [MAX_NAMELEN];
 
-	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+	if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
 		return;
 
+	/*
+	 * irq directories are registered only when a handler is
+	 * added, not when the descriptor is created, so multiple
+	 * tasks might try to register at the same time.
+	 */
+	mutex_lock(&register_lock);
+
+	if (desc->dir)
+		goto out_unlock;
+
 	memset(name, 0, MAX_NAMELEN);
 	sprintf(name, "%d", irq);
 
 	/* create /proc/irq/1234 */
 	desc->dir = proc_mkdir(name, root_irq_dir);
 	if (!desc->dir)
-		return;
+		goto out_unlock;
 
 #ifdef CONFIG_SMP
 	/* create /proc/irq/<irq>/smp_affinity */
@@ -358,6 +370,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 
 	proc_create_data("spurious", 0444, desc->dir,
 			 &irq_spurious_proc_fops, (void *)(long)irq);
+
+out_unlock:
+	mutex_unlock(&register_lock);
 }
 
 void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
diff --git a/kernel/module.c b/kernel/module.c
index 6282fdc4d..fac398bb5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -969,11 +969,15 @@ void symbol_put_addr(void *addr)
 	if (core_kernel_text(a))
 		return;
 
-	/* module_text_address is safe here: we're supposed to have reference
-	 * to module from symbol_get, so it can't go away. */
+	/*
+	 * Even though we hold a reference on the module; we still need to
+	 * disable preemption in order to safely traverse the data structure.
+	 */
+	preempt_disable();
 	modaddr = __module_text_address(a);
 	BUG_ON(!modaddr);
 	module_put(modaddr);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(symbol_put_addr);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d443c36c..751d37d87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1983,11 +1983,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
 	 * schedule one last time. The schedule call will never return, and
 	 * the scheduled task must drop that reference.
-	 * The test for TASK_DEAD must occur while the runqueue locks are
-	 * still held, otherwise prev could be scheduled on another cpu, die
-	 * there before we look at prev->state, and then the reference would
-	 * be dropped twice.
-	 *		Manfred Spraul <manfred@colorfullife.com>
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could rave with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
@@ -4625,7 +4625,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	struct task_struct *p;
 	int retval;
 
-	get_online_cpus();
 	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
@@ -4678,7 +4677,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	free_cpumask_var(cpus_allowed);
 out_put_task:
 	put_task_struct(p);
-	put_online_cpus();
 	return retval;
 }
 
@@ -4721,7 +4719,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	unsigned long flags;
 	int retval;
 
-	get_online_cpus();
 	rcu_read_lock();
 
 	retval = -ESRCH;
@@ -4734,12 +4731,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 		goto out_unlock;
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
-	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
 	rcu_read_unlock();
-	put_online_cpus();
 
 	return retval;
 }
@@ -7005,13 +7001,11 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 static void reinit_sched_domains(void)
 {
-	get_online_cpus();
 
 	/* Destroy domains first to force the rebuild */
 	partition_sched_domains(0, NULL, NULL);
 
 	rebuild_sched_domains();
-	put_online_cpus();
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7162,14 +7156,18 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
-	get_online_cpus();
+
+	/*
+	 * There's no userspace yet to cause hotplug operations; hence all the
+	 * cpu masks are stable and all blatant races in the below code cannot
+	 * happen.
+	 */
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
-	put_online_cpus();
 
 	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
 	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c65ccfcae..261c65543 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,6 @@
 
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
@@ -712,8 +713,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
+	 *
+	 * Pairs with the control dependency and rmb in try_to_wake_up().
 	 */
-	smp_wmb();
+	smp_mb();
 	prev->on_cpu = 0;
 #endif
 #ifdef CONFIG_DEBUG_SPINLOCK
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index aa8debaa1..2d0dc1461 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -60,6 +60,7 @@
 #include <linux/kmod.h>
 #include <linux/capability.h>
 #include <linux/binfmts.h>
+#include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c95833821..b3f540792 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -291,7 +291,7 @@ static void clocksource_watchdog(unsigned long data)
 			continue;
 
 		/* Check the deviation from the watchdog clocksource. */
-		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
+		if ((abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
 			clocksource_unstable(cs, cs_nsec - wd_nsec);
 			continue;
 		}
diff --git a/kernel/timer.c b/kernel/timer.c
index 219865e85..5b5310630 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
 #include <linux/slab.h>
 
 #include <asm/uaccess.h>
diff --git a/lib/Kconfig b/lib/Kconfig
index 09f0d8a64..4b23c9600 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -177,6 +177,15 @@ config LZO_COMPRESS
 config LZO_DECOMPRESS
 	tristate
 
+config LZ4_COMPRESS
+	tristate
+
+config LZ4HC_COMPRESS
+	tristate
+
+config LZ4_DECOMPRESS
+	tristate
+
 source "lib/xz/Kconfig"
 
 #
@@ -201,6 +210,10 @@ config DECOMPRESS_LZO
 	select LZO_DECOMPRESS
 	tristate
 
+config DECOMPRESS_LZ4
+	select LZ4_DECOMPRESS
+	tristate
+
 #
 # Generic allocator support is selected if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index 9161b969d..1919769ec 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -72,6 +72,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 obj-$(CONFIG_BCH) += bch.o
 obj-$(CONFIG_LZO_COMPRESS) += lzo/
 obj-$(CONFIG_LZO_DECOMPRESS) += lzo/
+obj-$(CONFIG_LZ4_COMPRESS) += lz4/
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/
 obj-$(CONFIG_XZ_DEC) += xz/
 obj-$(CONFIG_RAID6_PQ) += raid6/
 
@@ -80,6 +83,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o
 lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o
 lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o
 lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o
+lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o
 
 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile
new file mode 100644
index 000000000..8085d04e9
--- /dev/null
+++ b/lib/lz4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o
+obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o
+obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o
diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
new file mode 100644
index 000000000..28321d8f7
--- /dev/null
+++ b/lib/lz4/lz4_compress.c
@@ -0,0 +1,443 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ *  Changed for kernel use by:
+ *  Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+/*
+ * LZ4_compressCtx :
+ * -----------------
+ * Compress 'isize' bytes from 'source' into an output buffer 'dest' of
+ * maximum size 'maxOutputSize'.  * If it cannot achieve it, compression
+ * will stop, and result of the function will be zero.
+ * return : the number of bytes written in buffer 'dest', or 0 if the
+ * compression fails
+ */
+static inline int lz4_compressctx(void *ctx,
+		const char *source,
+		char *dest,
+		int isize,
+		int maxoutputsize)
+{
+	HTYPE *hashtable = (HTYPE *)ctx;
+	const u8 *ip = (u8 *)source;
+#if LZ4_ARCH64
+	const BYTE * const base = ip;
+#else
+	const int base = 0;
+#endif
+	const u8 *anchor = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	#define MATCHLIMIT (iend - LASTLITERALS)
+
+	u8 *op = (u8 *) dest;
+	u8 *const oend = op + maxoutputsize;
+	int length;
+	const int skipstrength = SKIPSTRENGTH;
+	u32 forwardh;
+	int lastrun;
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+	/* First Byte */
+	hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+	ip++;
+	forwardh = LZ4_HASH_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findmatchattempts = (1U << skipstrength) + 3;
+		const u8 *forwardip = ip;
+		const u8 *ref;
+		u8 *token;
+
+		/* Find a match */
+		do {
+			u32 h = forwardh;
+			int step = findmatchattempts++ >> skipstrength;
+			ip = forwardip;
+			forwardip = ip + step;
+
+			if (unlikely(forwardip > mflimit))
+				goto _last_literals;
+
+			forwardh = LZ4_HASH_VALUE(forwardip);
+			ref = base + hashtable[h];
+			hashtable[h] = ip - base;
+		} while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (u8 *)source) &&
+			unlikely(ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = (int)(ip - anchor);
+		token = op++;
+		/* check output limit */
+		if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+			(length >> 8) > oend))
+			return 0;
+
+		if (length >= (int)RUN_MASK) {
+			int len;
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254 ; len -= 255)
+				*op++ = 255;
+			*op++ = (u8)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+		/* Start Counting */
+		ip += MINMATCH;
+		/* MinMatch verified */
+		ref += MINMATCH;
+		anchor = ip;
+		while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) {
+			#if LZ4_ARCH64
+			u64 diff = A64(ref) ^ A64(ip);
+			#else
+			u32 diff = A32(ref) ^ A32(ip);
+			#endif
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NBCOMMONBYTES(diff);
+			goto _endcount;
+		}
+		#if LZ4_ARCH64
+		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+		#endif
+		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < MATCHLIMIT) && (*ref == *ip))
+			ip++;
+_endcount:
+		/* Encode MatchLength */
+		length = (int)(ip - anchor);
+		/* Check output limit */
+		if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend))
+			return 0;
+		if (length >= (int)ML_MASK) {
+			*token += ML_MASK;
+			length -= ML_MASK;
+			for (; length > 509 ; length -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (length > 254) {
+				length -= 255;
+				*op++ = 255;
+			}
+			*op++ = (u8)length;
+		} else
+			*token += length;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+
+		/* Fill table */
+		hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base;
+
+		/* Test next position */
+		ref = base + hashtable[LZ4_HASH_VALUE(ip)];
+		hashtable[LZ4_HASH_VALUE(ip)] = ip - base;
+		if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardh = LZ4_HASH_VALUE(ip);
+	}
+
+_last_literals:
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (((char *)op - dest) + lastrun + 1
+		+ ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize)
+		return 0;
+
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8)lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+static inline int lz4_compress64kctx(void *ctx,
+		const char *source,
+		char *dest,
+		int isize,
+		int maxoutputsize)
+{
+	u16 *hashtable = (u16 *)ctx;
+	const u8 *ip = (u8 *) source;
+	const u8 *anchor = ip;
+	const u8 *const base = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	#define MATCHLIMIT (iend - LASTLITERALS)
+
+	u8 *op = (u8 *) dest;
+	u8 *const oend = op + maxoutputsize;
+	int len, length;
+	const int skipstrength = SKIPSTRENGTH;
+	u32 forwardh;
+	int lastrun;
+
+	/* Init */
+	if (isize < MINLENGTH)
+		goto _last_literals;
+
+	memset((void *)hashtable, 0, LZ4_MEM_COMPRESS);
+
+	/* First Byte */
+	ip++;
+	forwardh = LZ4_HASH64K_VALUE(ip);
+
+	/* Main Loop */
+	for (;;) {
+		int findmatchattempts = (1U << skipstrength) + 3;
+		const u8 *forwardip = ip;
+		const u8 *ref;
+		u8 *token;
+
+		/* Find a match */
+		do {
+			u32 h = forwardh;
+			int step = findmatchattempts++ >> skipstrength;
+			ip = forwardip;
+			forwardip = ip + step;
+
+			if (forwardip > mflimit)
+				goto _last_literals;
+
+			forwardh = LZ4_HASH64K_VALUE(forwardip);
+			ref = base + hashtable[h];
+			hashtable[h] = (u16)(ip - base);
+		} while (A32(ref) != A32(ip));
+
+		/* Catch up */
+		while ((ip > anchor) && (ref > (u8 *)source)
+			&& (ip[-1] == ref[-1])) {
+			ip--;
+			ref--;
+		}
+
+		/* Encode Literal length */
+		length = (int)(ip - anchor);
+		token = op++;
+		/* Check output limit */
+		if (unlikely(op + length + (2 + 1 + LASTLITERALS)
+			+ (length >> 8) > oend))
+			return 0;
+		if (length >= (int)RUN_MASK) {
+			*token = (RUN_MASK << ML_BITS);
+			len = length - RUN_MASK;
+			for (; len > 254 ; len -= 255)
+				*op++ = 255;
+			*op++ = (u8)len;
+		} else
+			*token = (length << ML_BITS);
+
+		/* Copy Literals */
+		LZ4_BLINDCOPY(anchor, op, length);
+
+_next_match:
+		/* Encode Offset */
+		LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref));
+
+		/* Start Counting */
+		ip += MINMATCH;
+		/* MinMatch verified */
+		ref += MINMATCH;
+		anchor = ip;
+
+		while (ip < MATCHLIMIT - (STEPSIZE - 1)) {
+			#if LZ4_ARCH64
+			u64 diff = A64(ref) ^ A64(ip);
+			#else
+			u32 diff = A32(ref) ^ A32(ip);
+			#endif
+
+			if (!diff) {
+				ip += STEPSIZE;
+				ref += STEPSIZE;
+				continue;
+			}
+			ip += LZ4_NBCOMMONBYTES(diff);
+			goto _endcount;
+		}
+		#if LZ4_ARCH64
+		if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) {
+			ip += 4;
+			ref += 4;
+		}
+		#endif
+		if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) {
+			ip += 2;
+			ref += 2;
+		}
+		if ((ip < MATCHLIMIT) && (*ref == *ip))
+			ip++;
+_endcount:
+
+		/* Encode MatchLength */
+		len = (int)(ip - anchor);
+		/* Check output limit */
+		if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+			return 0;
+		if (len >= (int)ML_MASK) {
+			*token += ML_MASK;
+			len -= ML_MASK;
+			for (; len > 509 ; len -= 510) {
+				*op++ = 255;
+				*op++ = 255;
+			}
+			if (len > 254) {
+				len -= 255;
+				*op++ = 255;
+			}
+			*op++ = (u8)len;
+		} else
+			*token += len;
+
+		/* Test end of chunk */
+		if (ip > mflimit) {
+			anchor = ip;
+			break;
+		}
+
+		/* Fill table */
+		hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base);
+
+		/* Test next position */
+		ref = base + hashtable[LZ4_HASH64K_VALUE(ip)];
+		hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base);
+		if (A32(ref) == A32(ip)) {
+			token = op++;
+			*token = 0;
+			goto _next_match;
+		}
+
+		/* Prepare next loop */
+		anchor = ip++;
+		forwardh = LZ4_HASH64K_VALUE(ip);
+	}
+
+_last_literals:
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend)
+		return 0;
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8)lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+	/* End */
+	return (int)(((char *)op) - dest);
+}
+
+int lz4_compress(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	if (src_len < LZ4_64KLIMIT)
+		out_len = lz4_compress64kctx(wrkmem, src, dst, src_len,
+				lz4_compressbound(src_len));
+	else
+		out_len = lz4_compressctx(wrkmem, src, dst, src_len,
+				lz4_compressbound(src_len));
+
+	if (out_len < 0)
+		goto exit;
+
+	*dst_len = out_len;
+
+	return 0;
+exit:
+	return ret;
+}
+EXPORT_SYMBOL(lz4_compress);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4 compressor");
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
new file mode 100644
index 000000000..6d940c72b
--- /dev/null
+++ b/lib/lz4/lz4_decompress.c
@@ -0,0 +1,343 @@
+/*
+ * LZ4 Decompressor for Linux kernel
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * Based on LZ4 implementation by Yann Collet.
+ *
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You can contact the author at :
+ *  - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ *  - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#ifndef STATIC
+#include <linux/module.h>
+#include <linux/kernel.h>
+#endif
+#include <linux/lz4.h>
+
+#include <asm/unaligned.h>
+
+#include "lz4defs.h"
+
+static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+static int lz4_uncompress(const char *source, char *dest, int osize)
+{
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *ref;
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + osize;
+	BYTE *cpy;
+	unsigned token;
+	size_t length;
+
+	while (1) {
+
+		/* get runlength */
+		token = *ip++;
+		length = (token >> ML_BITS);
+		if (length == RUN_MASK) {
+			size_t len;
+
+			len = *ip++;
+			for (; len == 255; length += 255)
+				len = *ip++;
+			if (unlikely(length > (size_t)(length + len)))
+				goto _output_error;
+			length += len;
+		}
+
+		/* copy literals */
+		cpy = op + length;
+		if (unlikely(cpy > oend - COPYLENGTH)) {
+			/*
+			 * Error: not enough place for another match
+			 * (min 4) + 5 literals
+			 */
+			if (cpy != oend)
+				goto _output_error;
+
+			memcpy(op, ip, length);
+			ip += length;
+			break; /* EOF */
+		}
+		LZ4_WILDCOPY(ip, op, cpy);
+		ip -= (op - cpy);
+		op = cpy;
+
+		/* get offset */
+		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		ip += 2;
+
+		/* Error: offset create reference outside destination buffer */
+		if (unlikely(ref < (BYTE *const) dest))
+			goto _output_error;
+
+		/* get matchlength */
+		length = token & ML_MASK;
+		if (length == ML_MASK) {
+			for (; *ip == 255; length += 255)
+				ip++;
+			if (unlikely(length > (size_t)(length + *ip)))
+				goto _output_error;
+			length += *ip++;
+		}
+
+		/* copy repeated sequence */
+		if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+			int dec64 = dec64table[op - ref];
+#else
+			const int dec64 = 0;
+#endif
+			op[0] = ref[0];
+			op[1] = ref[1];
+			op[2] = ref[2];
+			op[3] = ref[3];
+			op += 4;
+			ref += 4;
+			ref -= dec32table[op-ref];
+			PUT4(ref, op);
+			op += STEPSIZE - 4;
+			ref -= dec64;
+		} else {
+			LZ4_COPYSTEP(ref, op);
+		}
+		cpy = op + length - (STEPSIZE - 4);
+		if (cpy > (oend - COPYLENGTH)) {
+
+			/* Error: request to write beyond destination buffer */
+			if (cpy > oend)
+				goto _output_error;
+#if LZ4_ARCH64
+			if ((ref + COPYLENGTH) > oend)
+#else
+			if ((ref + COPYLENGTH) > oend ||
+					(op + COPYLENGTH) > oend)
+#endif
+				goto _output_error;
+			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			while (op < cpy)
+				*op++ = *ref++;
+			op = cpy;
+			/*
+			 * Check EOF (should never happen, since last 5 bytes
+			 * are supposed to be literals)
+			 */
+			if (op == oend)
+				goto _output_error;
+			continue;
+		}
+		LZ4_SECURECOPY(ref, op, cpy);
+		op = cpy; /* correction */
+	}
+	/* end of decoding */
+	return (int) (((char *)ip) - source);
+
+	/* write overflow error detected */
+_output_error:
+	return -1;
+}
+
+static int lz4_uncompress_unknownoutputsize(const char *source, char *dest,
+				int isize, size_t maxoutputsize)
+{
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE *const iend = ip + isize;
+	const BYTE *ref;
+
+
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + maxoutputsize;
+	BYTE *cpy;
+
+	/* Main Loop */
+	while (ip < iend) {
+
+		unsigned token;
+		size_t length;
+
+		/* get runlength */
+		token = *ip++;
+		length = (token >> ML_BITS);
+		if (length == RUN_MASK) {
+			int s = 255;
+			while ((ip < iend) && (s == 255)) {
+				s = *ip++;
+				if (unlikely(length > (size_t)(length + s)))
+					goto _output_error;
+				length += s;
+			}
+		}
+		/* copy literals */
+		cpy = op + length;
+		if ((cpy > oend - COPYLENGTH) ||
+			(ip + length > iend - COPYLENGTH)) {
+
+			if (cpy > oend)
+				goto _output_error;/* writes beyond buffer */
+
+			if (ip + length != iend)
+				goto _output_error;/*
+						    * Error: LZ4 format requires
+						    * to consume all input
+						    * at this stage
+						    */
+			memcpy(op, ip, length);
+			op += length;
+			break;/* Necessarily EOF, due to parsing restrictions */
+		}
+		LZ4_WILDCOPY(ip, op, cpy);
+		ip -= (op - cpy);
+		op = cpy;
+
+		/* get offset */
+		LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+		ip += 2;
+		if (ref < (BYTE * const) dest)
+			goto _output_error;
+			/*
+			 * Error : offset creates reference
+			 * outside of destination buffer
+			 */
+
+		/* get matchlength */
+		length = (token & ML_MASK);
+		if (length == ML_MASK) {
+			while (ip < iend) {
+				int s = *ip++;
+				if (unlikely(length > (size_t)(length + s)))
+					goto _output_error;
+				length += s;
+				if (s == 255)
+					continue;
+				break;
+			}
+		}
+
+		/* copy repeated sequence */
+		if (unlikely((op - ref) < STEPSIZE)) {
+#if LZ4_ARCH64
+			int dec64 = dec64table[op - ref];
+#else
+			const int dec64 = 0;
+#endif
+				op[0] = ref[0];
+				op[1] = ref[1];
+				op[2] = ref[2];
+				op[3] = ref[3];
+				op += 4;
+				ref += 4;
+				ref -= dec32table[op - ref];
+				PUT4(ref, op);
+				op += STEPSIZE - 4;
+				ref -= dec64;
+		} else {
+			LZ4_COPYSTEP(ref, op);
+		}
+		cpy = op + length - (STEPSIZE-4);
+		if (cpy > oend - COPYLENGTH) {
+			if (cpy > oend)
+				goto _output_error; /* write outside of buf */
+#if LZ4_ARCH64
+			if ((ref + COPYLENGTH) > oend)
+#else
+			if ((ref + COPYLENGTH) > oend ||
+					(op + COPYLENGTH) > oend)
+#endif
+				goto _output_error;
+			LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+			while (op < cpy)
+				*op++ = *ref++;
+			op = cpy;
+			/*
+			 * Check EOF (should never happen, since last 5 bytes
+			 * are supposed to be literals)
+			 */
+			if (op == oend)
+				goto _output_error;
+			continue;
+		}
+		LZ4_SECURECOPY(ref, op, cpy);
+		op = cpy; /* correction */
+	}
+	/* end of decoding */
+	return (int) (((char *) op) - dest);
+
+	/* write overflow error detected */
+_output_error:
+	return -1;
+}
+
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+		unsigned char *dest, size_t actual_dest_len)
+{
+	int ret = -1;
+	int input_len = 0;
+
+	input_len = lz4_uncompress(src, dest, actual_dest_len);
+	if (input_len < 0)
+		goto exit_0;
+	*src_len = input_len;
+
+	return 0;
+exit_0:
+	return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL(lz4_decompress);
+#endif
+
+int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
+		unsigned char *dest, size_t *dest_len)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len,
+					*dest_len);
+	if (out_len < 0)
+		goto exit_0;
+	*dest_len = out_len;
+
+	return 0;
+exit_0:
+	return ret;
+}
+#ifndef STATIC
+EXPORT_SYMBOL(lz4_decompress_unknownoutputsize);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4 Decompressor");
+#endif
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
new file mode 100644
index 000000000..9b4182fd3
--- /dev/null
+++ b/lib/lz4/lz4defs.h
@@ -0,0 +1,157 @@
+/*
+ * lz4defs.h -- architecture specific defines
+ *
+ * Copyright (C) 2013, LG Electronics, Kyungsik Lee <kyungsik.lee@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Detects 64 bits mode
+ */
+#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \
+	|| defined(__ppc64__) || defined(__LP64__))
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Architecture-specific macros
+ */
+#define ARM_EFFICIENT_UNALIGNED_ACCESS
+#define BYTE	u8
+typedef struct _U16_S { u16 v; } U16_S;
+typedef struct _U32_S { u32 v; } U32_S;
+typedef struct _U64_S { u64 v; } U64_S;
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)		\
+	|| defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6	\
+	&& defined(ARM_EFFICIENT_UNALIGNED_ACCESS)
+
+#define A16(x) (((U16_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A64(x) (((U64_S *)(x))->v)
+
+#define PUT4(s, d) (A32(d) = A32(s))
+#define PUT8(s, d) (A64(d) = A64(s))
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v)	\
+	do {	\
+		A16(p) = v; \
+		p += 2; \
+	} while (0)
+#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+
+#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v))
+#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v))
+#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v))
+
+#define PUT4(s, d) \
+	put_unaligned(get_unaligned((const u32 *) s), (u32 *) d)
+#define PUT8(s, d) \
+	put_unaligned(get_unaligned((const u64 *) s), (u64 *) d)
+
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v)	\
+	do {	\
+		put_unaligned(v, (u16 *)(p)); \
+		p += 2; \
+	} while (0)
+#endif
+
+#define COPYLENGTH 8
+#define ML_BITS  4
+#define ML_MASK  ((1U << ML_BITS) - 1)
+#define RUN_BITS (8 - ML_BITS)
+#define RUN_MASK ((1U << RUN_BITS) - 1)
+#define MEMORY_USAGE	14
+#define MINMATCH	4
+#define SKIPSTRENGTH	6
+#define LASTLITERALS	5
+#define MFLIMIT		(COPYLENGTH + MINMATCH)
+#define MINLENGTH	(MFLIMIT + 1)
+#define MAXD_LOG	16
+#define MAXD		(1 << MAXD_LOG)
+#define MAXD_MASK	(u32)(MAXD - 1)
+#define MAX_DISTANCE	(MAXD - 1)
+#define HASH_LOG	(MAXD_LOG - 1)
+#define HASHTABLESIZE	(1 << HASH_LOG)
+#define MAX_NB_ATTEMPTS	256
+#define OPTIMAL_ML	(int)((ML_MASK-1)+MINMATCH)
+#define LZ4_64KLIMIT	((1<<16) + (MFLIMIT - 1))
+#define HASHLOG64K	((MEMORY_USAGE - 2) + 1)
+#define HASH64KTABLESIZE	(1U << HASHLOG64K)
+#define LZ4_HASH_VALUE(p)	(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - (MEMORY_USAGE-2)))
+#define LZ4_HASH64K_VALUE(p)	(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - HASHLOG64K))
+#define HASH_VALUE(p)		(((A32(p)) * 2654435761U) >> \
+				((MINMATCH * 8) - HASH_LOG))
+
+#if LZ4_ARCH64/* 64-bit */
+#define STEPSIZE 8
+
+#define LZ4_COPYSTEP(s, d)	\
+	do {			\
+		PUT8(s, d);	\
+		d += 8;		\
+		s += 8;		\
+	} while (0)
+
+#define LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d)
+
+#define LZ4_SECURECOPY(s, d, e)			\
+	do {					\
+		if (d < e) {			\
+			LZ4_WILDCOPY(s, d, e);	\
+		}				\
+	} while (0)
+#define HTYPE u32
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3)
+#endif
+
+#else	/* 32-bit */
+#define STEPSIZE 4
+
+#define LZ4_COPYSTEP(s, d)	\
+	do {			\
+		PUT4(s, d);	\
+		d += 4;		\
+		s += 4;		\
+	} while (0)
+
+#define LZ4_COPYPACKET(s, d)		\
+	do {				\
+		LZ4_COPYSTEP(s, d);	\
+		LZ4_COPYSTEP(s, d);	\
+	} while (0)
+
+#define LZ4_SECURECOPY	LZ4_WILDCOPY
+#define HTYPE const u8*
+
+#ifdef __BIG_ENDIAN
+#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3)
+#else
+#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3)
+#endif
+
+#endif
+
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+	(d = s - get_unaligned_le16(p))
+
+#define LZ4_WILDCOPY(s, d, e)		\
+	do {				\
+		LZ4_COPYPACKET(s, d);	\
+	} while (d < e)
+
+#define LZ4_BLINDCOPY(s, d, l)	\
+	do {	\
+		u8 *e = (d) + l;	\
+		LZ4_WILDCOPY(s, d, e);	\
+		d = e;	\
+	} while (0)
diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c
new file mode 100644
index 000000000..f344f76b6
--- /dev/null
+++ b/lib/lz4/lz4hc_compress.c
@@ -0,0 +1,539 @@
+/*
+ * LZ4 HC - High Compression Mode of LZ4
+ * Copyright (C) 2011-2012, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ *
+ *  Changed for kernel use by:
+ *  Chanho Min <chanho.min@lge.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/lz4.h>
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+struct lz4hc_data {
+	const u8 *base;
+	HTYPE hashtable[HASHTABLESIZE];
+	u16 chaintable[MAXD];
+	const u8 *nexttoupdate;
+} __attribute__((__packed__));
+
+static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base)
+{
+	memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable));
+	memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable));
+
+#if LZ4_ARCH64
+	hc4->nexttoupdate = base + 1;
+#else
+	hc4->nexttoupdate = base;
+#endif
+	hc4->base = base;
+	return 1;
+}
+
+/* Update chains up to ip (excluded) */
+static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip)
+{
+	u16 *chaintable = hc4->chaintable;
+	HTYPE *hashtable  = hc4->hashtable;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+
+	while (hc4->nexttoupdate < ip) {
+		const u8 *p = hc4->nexttoupdate;
+		size_t delta = p - (hashtable[HASH_VALUE(p)] + base);
+		if (delta > MAX_DISTANCE)
+			delta = MAX_DISTANCE;
+		chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta;
+		hashtable[HASH_VALUE(p)] = (p) - base;
+		hc4->nexttoupdate++;
+	}
+}
+
+static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2,
+		const u8 *const matchlimit)
+{
+	const u8 *p1t = p1;
+
+	while (p1t < matchlimit - (STEPSIZE - 1)) {
+#if LZ4_ARCH64
+		u64 diff = A64(p2) ^ A64(p1t);
+#else
+		u32 diff = A32(p2) ^ A32(p1t);
+#endif
+		if (!diff) {
+			p1t += STEPSIZE;
+			p2 += STEPSIZE;
+			continue;
+		}
+		p1t += LZ4_NBCOMMONBYTES(diff);
+		return p1t - p1;
+	}
+#if LZ4_ARCH64
+	if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) {
+		p1t += 4;
+		p2 += 4;
+	}
+#endif
+
+	if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) {
+		p1t += 2;
+		p2 += 2;
+	}
+	if ((p1t < matchlimit) && (*p2 == *p1t))
+		p1t++;
+	return p1t - p1;
+}
+
+static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4,
+		const u8 *ip, const u8 *const matchlimit, const u8 **matchpos)
+{
+	u16 *const chaintable = hc4->chaintable;
+	HTYPE *const hashtable = hc4->hashtable;
+	const u8 *ref;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+	int nbattempts = MAX_NB_ATTEMPTS;
+	size_t repl = 0, ml = 0;
+	u16 delta;
+
+	/* HC4 match finder */
+	lz4hc_insert(hc4, ip);
+	ref = hashtable[HASH_VALUE(ip)] + base;
+
+	/* potential repetition */
+	if (ref >= ip-4) {
+		/* confirmed */
+		if (A32(ref) == A32(ip)) {
+			delta = (u16)(ip-ref);
+			repl = ml  = lz4hc_commonlength(ip + MINMATCH,
+					ref + MINMATCH, matchlimit) + MINMATCH;
+			*matchpos = ref;
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+
+	while ((ref >= ip - MAX_DISTANCE) && nbattempts) {
+		nbattempts--;
+		if (*(ref + ml) == *(ip + ml)) {
+			if (A32(ref) == A32(ip)) {
+				size_t mlt =
+					lz4hc_commonlength(ip + MINMATCH,
+					ref + MINMATCH, matchlimit) + MINMATCH;
+				if (mlt > ml) {
+					ml = mlt;
+					*matchpos = ref;
+				}
+			}
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+
+	/* Complete table */
+	if (repl) {
+		const BYTE *ptr = ip;
+		const BYTE *end;
+		end = ip + repl - (MINMATCH-1);
+		/* Pre-Load */
+		while (ptr < end - delta) {
+			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+			ptr++;
+		}
+		do {
+			chaintable[(size_t)(ptr) & MAXD_MASK] = delta;
+			/* Head of chain */
+			hashtable[HASH_VALUE(ptr)] = (ptr) - base;
+			ptr++;
+		} while (ptr < end);
+		hc4->nexttoupdate = end;
+	}
+
+	return (int)ml;
+}
+
+static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4,
+	const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest,
+	const u8 **matchpos, const u8 **startpos)
+{
+	u16 *const chaintable = hc4->chaintable;
+	HTYPE *const hashtable = hc4->hashtable;
+#if LZ4_ARCH64
+	const BYTE * const base = hc4->base;
+#else
+	const int base = 0;
+#endif
+	const u8 *ref;
+	int nbattempts = MAX_NB_ATTEMPTS;
+	int delta = (int)(ip - startlimit);
+
+	/* First Match */
+	lz4hc_insert(hc4, ip);
+	ref = hashtable[HASH_VALUE(ip)] + base;
+
+	while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base)
+		&& (nbattempts)) {
+		nbattempts--;
+		if (*(startlimit + longest) == *(ref - delta + longest)) {
+			if (A32(ref) == A32(ip)) {
+				const u8 *reft = ref + MINMATCH;
+				const u8 *ipt = ip + MINMATCH;
+				const u8 *startt = ip;
+
+				while (ipt < matchlimit-(STEPSIZE - 1)) {
+					#if LZ4_ARCH64
+					u64 diff = A64(reft) ^ A64(ipt);
+					#else
+					u32 diff = A32(reft) ^ A32(ipt);
+					#endif
+
+					if (!diff) {
+						ipt += STEPSIZE;
+						reft += STEPSIZE;
+						continue;
+					}
+					ipt += LZ4_NBCOMMONBYTES(diff);
+					goto _endcount;
+				}
+				#if LZ4_ARCH64
+				if ((ipt < (matchlimit - 3))
+					&& (A32(reft) == A32(ipt))) {
+					ipt += 4;
+					reft += 4;
+				}
+				ipt += 2;
+				#endif
+				if ((ipt < (matchlimit - 1))
+					&& (A16(reft) == A16(ipt))) {
+					reft += 2;
+				}
+				if ((ipt < matchlimit) && (*reft == *ipt))
+					ipt++;
+_endcount:
+				reft = ref;
+
+				while ((startt > startlimit)
+					&& (reft > hc4->base)
+					&& (startt[-1] == reft[-1])) {
+					startt--;
+					reft--;
+				}
+
+				if ((ipt - startt) > longest) {
+					longest = (int)(ipt - startt);
+					*matchpos = reft;
+					*startpos = startt;
+				}
+			}
+		}
+		ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK];
+	}
+	return longest;
+}
+
+static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor,
+		int ml, const u8 *ref)
+{
+	int length, len;
+	u8 *token;
+
+	/* Encode Literal length */
+	length = (int)(*ip - *anchor);
+	token = (*op)++;
+	if (length >= (int)RUN_MASK) {
+		*token = (RUN_MASK << ML_BITS);
+		len = length - RUN_MASK;
+		for (; len > 254 ; len -= 255)
+			*(*op)++ = 255;
+		*(*op)++ = (u8)len;
+	} else
+		*token = (length << ML_BITS);
+
+	/* Copy Literals */
+	LZ4_BLINDCOPY(*anchor, *op, length);
+
+	/* Encode Offset */
+	LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref));
+
+	/* Encode MatchLength */
+	len = (int)(ml - MINMATCH);
+	if (len >= (int)ML_MASK) {
+		*token += ML_MASK;
+		len -= ML_MASK;
+		for (; len > 509 ; len -= 510) {
+			*(*op)++ = 255;
+			*(*op)++ = 255;
+		}
+		if (len > 254) {
+			len -= 255;
+			*(*op)++ = 255;
+		}
+		*(*op)++ = (u8)len;
+	} else
+		*token += len;
+
+	/* Prepare next loop */
+	*ip += ml;
+	*anchor = *ip;
+
+	return 0;
+}
+
+static int lz4_compresshcctx(struct lz4hc_data *ctx,
+		const char *source,
+		char *dest,
+		int isize)
+{
+	const u8 *ip = (const u8 *)source;
+	const u8 *anchor = ip;
+	const u8 *const iend = ip + isize;
+	const u8 *const mflimit = iend - MFLIMIT;
+	const u8 *const matchlimit = (iend - LASTLITERALS);
+
+	u8 *op = (u8 *)dest;
+
+	int ml, ml2, ml3, ml0;
+	const u8 *ref = NULL;
+	const u8 *start2 = NULL;
+	const u8 *ref2 = NULL;
+	const u8 *start3 = NULL;
+	const u8 *ref3 = NULL;
+	const u8 *start0;
+	const u8 *ref0;
+	int lastrun;
+
+	ip++;
+
+	/* Main Loop */
+	while (ip < mflimit) {
+		ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref));
+		if (!ml) {
+			ip++;
+			continue;
+		}
+
+		/* saved, in case we would skip too much */
+		start0 = ip;
+		ref0 = ref;
+		ml0 = ml;
+_search2:
+		if (ip+ml < mflimit)
+			ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2,
+				ip + 1, matchlimit, ml, &ref2, &start2);
+		else
+			ml2 = ml;
+		/* No better match */
+		if (ml2 == ml) {
+			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			continue;
+		}
+
+		if (start0 < ip) {
+			/* empirical */
+			if (start2 < ip + ml0) {
+				ip = start0;
+				ref = ref0;
+				ml = ml0;
+			}
+		}
+		/*
+		 * Here, start0==ip
+		 * First Match too small : removed
+		 */
+		if ((start2 - ip) < 3) {
+			ml = ml2;
+			ip = start2;
+			ref = ref2;
+			goto _search2;
+		}
+
+_search3:
+		/*
+		 * Currently we have :
+		 * ml2 > ml1, and
+		 * ip1+3 <= ip2 (usually < ip1+ml1)
+		 */
+		if ((start2 - ip) < OPTIMAL_ML) {
+			int correction;
+			int new_ml = ml;
+			if (new_ml > OPTIMAL_ML)
+				new_ml = OPTIMAL_ML;
+			if (ip + new_ml > start2 + ml2 - MINMATCH)
+				new_ml = (int)(start2 - ip) + ml2 - MINMATCH;
+			correction = new_ml - (int)(start2 - ip);
+			if (correction > 0) {
+				start2 += correction;
+				ref2 += correction;
+				ml2 -= correction;
+			}
+		}
+		/*
+		 * Now, we have start2 = ip+new_ml,
+		 * with new_ml=min(ml, OPTIMAL_ML=18)
+		 */
+		if (start2 + ml2 < mflimit)
+			ml3 = lz4hc_insertandgetwidermatch(ctx,
+				start2 + ml2 - 3, start2, matchlimit,
+				ml2, &ref3, &start3);
+		else
+			ml3 = ml2;
+
+		/* No better match : 2 sequences to encode */
+		if (ml3 == ml2) {
+			/* ip & ref are known; Now for ml */
+			if (start2 < ip+ml)
+				ml = (int)(start2 - ip);
+
+			/* Now, encode 2 sequences */
+			lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+			ip = start2;
+			lz4_encodesequence(&ip, &op, &anchor, ml2, ref2);
+			continue;
+		}
+
+		/* Not enough space for match 2 : remove it */
+		if (start3 < ip + ml + 3) {
+			/*
+			 * can write Seq1 immediately ==> Seq2 is removed,
+			 * so Seq3 becomes Seq1
+			 */
+			if (start3 >= (ip + ml)) {
+				if (start2 < ip + ml) {
+					int correction =
+						(int)(ip + ml - start2);
+					start2 += correction;
+					ref2 += correction;
+					ml2 -= correction;
+					if (ml2 < MINMATCH) {
+						start2 = start3;
+						ref2 = ref3;
+						ml2 = ml3;
+					}
+				}
+
+				lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+				ip  = start3;
+				ref = ref3;
+				ml  = ml3;
+
+				start0 = start2;
+				ref0 = ref2;
+				ml0 = ml2;
+				goto _search2;
+			}
+
+			start2 = start3;
+			ref2 = ref3;
+			ml2 = ml3;
+			goto _search3;
+		}
+
+		/*
+		 * OK, now we have 3 ascending matches; let's write at least
+		 * the first one ip & ref are known; Now for ml
+		 */
+		if (start2 < ip + ml) {
+			if ((start2 - ip) < (int)ML_MASK) {
+				int correction;
+				if (ml > OPTIMAL_ML)
+					ml = OPTIMAL_ML;
+				if (ip + ml > start2 + ml2 - MINMATCH)
+					ml = (int)(start2 - ip) + ml2
+						- MINMATCH;
+				correction = ml - (int)(start2 - ip);
+				if (correction > 0) {
+					start2 += correction;
+					ref2 += correction;
+					ml2 -= correction;
+				}
+			} else
+				ml = (int)(start2 - ip);
+		}
+		lz4_encodesequence(&ip, &op, &anchor, ml, ref);
+
+		ip = start2;
+		ref = ref2;
+		ml = ml2;
+
+		start2 = start3;
+		ref2 = ref3;
+		ml2 = ml3;
+
+		goto _search3;
+	}
+
+	/* Encode Last Literals */
+	lastrun = (int)(iend - anchor);
+	if (lastrun >= (int)RUN_MASK) {
+		*op++ = (RUN_MASK << ML_BITS);
+		lastrun -= RUN_MASK;
+		for (; lastrun > 254 ; lastrun -= 255)
+			*op++ = 255;
+		*op++ = (u8) lastrun;
+	} else
+		*op++ = (lastrun << ML_BITS);
+	memcpy(op, anchor, iend - anchor);
+	op += iend - anchor;
+	/* End */
+	return (int) (((char *)op) - dest);
+}
+
+int lz4hc_compress(const unsigned char *src, size_t src_len,
+			unsigned char *dst, size_t *dst_len, void *wrkmem)
+{
+	int ret = -1;
+	int out_len = 0;
+
+	struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem;
+	lz4hc_init(hc4, (const u8 *)src);
+	out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src,
+		(char *)dst, (int)src_len);
+
+	if (out_len < 0)
+		goto exit;
+
+	*dst_len = out_len;
+	return 0;
+
+exit:
+	return ret;
+}
+EXPORT_SYMBOL(lz4hc_compress);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("LZ4HC compressor");
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 3ac50dc55..6ebbdba6b 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,35 +32,9 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
+#include <linux/hardirq.h>		/* in_interrupt() */
 
 
-#ifdef __KERNEL__
-#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
-#endif
-
-#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
-
-#define RADIX_TREE_TAG_LONGS	\
-	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-
-struct radix_tree_node {
-	unsigned int	height;		/* Height from the bottom */
-	unsigned int	count;
-	union {
-		struct radix_tree_node *parent;	/* Used when ascending tree */
-		struct rcu_head	rcu_head;	/* Used when freeing node */
-	};
-	void __rcu	*slots[RADIX_TREE_MAP_SIZE];
-	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-};
-
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
-					  RADIX_TREE_MAP_SHIFT))
-
 /*
  * The height_to_maxindex array needs to be one deeper than the maximum
  * path as height 0 holds only 1 entry.
@@ -72,12 +46,25 @@ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
  */
 static struct kmem_cache *radix_tree_node_cachep;
 
+/*
+ * The radix tree is variable-height, so an insert operation not only has
+ * to build the branch to its corresponding item, it also has to build the
+ * branch to existing items if the size has to be increased (by
+ * radix_tree_extend).
+ *
+ * The worst case is a zero height tree with just a single item at index 0,
+ * and then inserting an item at index ULONG_MAX. This requires 2 new branches
+ * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
+ * Hence:
+ */
+#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
+
 /*
  * Per-cpu pool of preloaded nodes
  */
 struct radix_tree_preload {
 	int nr;
-	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+	struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE];
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
@@ -194,7 +181,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	if (!(gfp_mask & __GFP_WAIT)) {
+	/*
+	 * Preload code isn't irq safe and it doesn't make sence to use
+	 * preloading in the interrupt anyway as all the allocations have to
+	 * be atomic. So just do normal allocation when in interrupt.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
 		struct radix_tree_preload *rtp;
 
 		/*
@@ -202,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 		 * succeed in getting a node here (and never reach
 		 * kmem_cache_alloc)
 		 */
-		rtp = &__get_cpu_var(radix_tree_preloads);
+		rtp = this_cpu_ptr(&radix_tree_preloads);
 		if (rtp->nr) {
 			ret = rtp->nodes[rtp->nr - 1];
 			rtp->nodes[rtp->nr - 1] = NULL;
@@ -251,21 +243,21 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_WAIT being passed to INIT_RADIX_TREE().
  */
-int radix_tree_preload(gfp_t gfp_mask)
+static int __radix_tree_preload(gfp_t gfp_mask)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
 	int ret = -ENOMEM;
 
 	preempt_disable();
-	rtp = &__get_cpu_var(radix_tree_preloads);
+	rtp = this_cpu_ptr(&radix_tree_preloads);
 	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
 		preempt_enable();
 		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 		if (node == NULL)
 			goto out;
 		preempt_disable();
-		rtp = &__get_cpu_var(radix_tree_preloads);
+		rtp = this_cpu_ptr(&radix_tree_preloads);
 		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
 			rtp->nodes[rtp->nr++] = node;
 		else
@@ -275,8 +267,39 @@ int radix_tree_preload(gfp_t gfp_mask)
 out:
 	return ret;
 }
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ *
+ * To make use of this facility, the radix tree must be initialised without
+ * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+	/* Warn on non-sensical use... */
+	WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
+	return __radix_tree_preload(gfp_mask);
+}
 EXPORT_SYMBOL(radix_tree_preload);
 
+/*
+ * The same as above function, except we don't guarantee preloading happens.
+ * We do it, if we decide it helps. On success, return zero with preemption
+ * disabled. On error, return -ENOMEM with preemption not disabled.
+ */
+int radix_tree_maybe_preload(gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_WAIT)
+		return __radix_tree_preload(gfp_mask);
+	/* Preloading doesn't help anything with this gfp mask, skip it */
+	preempt_disable();
+	return 0;
+}
+EXPORT_SYMBOL(radix_tree_maybe_preload);
+
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -337,23 +360,28 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 }
 
 /**
- *	radix_tree_insert    -    insert into a radix tree
+ *	__radix_tree_create	-	create a slot in a radix tree
  *	@root:		radix tree root
  *	@index:		index key
- *	@item:		item to insert
+ *	@nodep:		returns node
+ *	@slotp:		returns slot
  *
- *	Insert an item into the radix tree at position @index.
+ *	Create, if necessary, and return the node and slot for an item
+ *	at position @index in the radix tree @root.
+ *
+ *	Until there is more than one item in the tree, no nodes are
+ *	allocated and @root->rnode is used as a direct slot instead of
+ *	pointing to a node, in which case *@nodep will be NULL.
+ *
+ *	Returns -ENOMEM, or 0 for success.
  */
-int radix_tree_insert(struct radix_tree_root *root,
-			unsigned long index, void *item)
+int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
+			struct radix_tree_node **nodep, void ***slotp)
 {
 	struct radix_tree_node *node = NULL, *slot;
-	unsigned int height, shift;
-	int offset;
+	unsigned int height, shift, offset;
 	int error;
 
-	BUG_ON(radix_tree_is_indirect_ptr(item));
-
 	/* Make sure the tree is high enough.  */
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
@@ -389,16 +417,42 @@ int radix_tree_insert(struct radix_tree_root *root,
 		height--;
 	}
 
-	if (slot != NULL)
+	if (nodep)
+		*nodep = node;
+	if (slotp)
+		*slotp = node ? node->slots + offset : (void **)&root->rnode;
+	return 0;
+}
+
+/**
+ *	radix_tree_insert    -    insert into a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@item:		item to insert
+ *
+ *	Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
+{
+	struct radix_tree_node *node;
+	void **slot;
+	int error;
+
+	BUG_ON(radix_tree_is_indirect_ptr(item));
+
+	error = __radix_tree_create(root, index, &node, &slot);
+	if (error)
+		return error;
+	if (*slot != NULL)
 		return -EEXIST;
+	rcu_assign_pointer(*slot, item);
 
 	if (node) {
 		node->count++;
-		rcu_assign_pointer(node->slots[offset], item);
-		BUG_ON(tag_get(node, 0, offset));
-		BUG_ON(tag_get(node, 1, offset));
+		BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
+		BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
 	} else {
-		rcu_assign_pointer(root->rnode, item);
 		BUG_ON(root_tag_get(root, 0));
 		BUG_ON(root_tag_get(root, 1));
 	}
@@ -407,15 +461,26 @@ int radix_tree_insert(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_insert);
 
-/*
- * is_slot == 1 : search for the slot.
- * is_slot == 0 : search for the node.
+/**
+ *	__radix_tree_lookup	-	lookup an item in a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@nodep:		returns node
+ *	@slotp:		returns slot
+ *
+ *	Lookup and return the item at position @index in the radix
+ *	tree @root.
+ *
+ *	Until there is more than one item in the tree, no nodes are
+ *	allocated and @root->rnode is used as a direct slot instead of
+ *	pointing to a node, in which case *@nodep will be NULL.
  */
-static void *radix_tree_lookup_element(struct radix_tree_root *root,
-				unsigned long index, int is_slot)
+void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
+			  struct radix_tree_node **nodep, void ***slotp)
 {
+	struct radix_tree_node *node, *parent;
 	unsigned int height, shift;
-	struct radix_tree_node *node, **slot;
+	void **slot;
 
 	node = rcu_dereference_raw(root->rnode);
 	if (node == NULL)
@@ -424,7 +489,12 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
 	if (!radix_tree_is_indirect_ptr(node)) {
 		if (index > 0)
 			return NULL;
-		return is_slot ? (void *)&root->rnode : node;
+
+		if (nodep)
+			*nodep = NULL;
+		if (slotp)
+			*slotp = (void **)&root->rnode;
+		return node;
 	}
 	node = indirect_to_ptr(node);
 
@@ -435,8 +505,8 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
 	do {
-		slot = (struct radix_tree_node **)
-			(node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
+		parent = node;
+		slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
 		node = rcu_dereference_raw(*slot);
 		if (node == NULL)
 			return NULL;
@@ -445,7 +515,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
 		height--;
 	} while (height > 0);
 
-	return is_slot ? (void *)slot : indirect_to_ptr(node);
+	if (nodep)
+		*nodep = parent;
+	if (slotp)
+		*slotp = slot;
+	return node;
 }
 
 /**
@@ -463,7 +537,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
  */
 void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 {
-	return (void **)radix_tree_lookup_element(root, index, 1);
+	void **slot;
+
+	if (!__radix_tree_lookup(root, index, NULL, &slot))
+		return NULL;
+	return slot;
 }
 EXPORT_SYMBOL(radix_tree_lookup_slot);
 
@@ -481,7 +559,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot);
  */
 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 {
-	return radix_tree_lookup_element(root, index, 0);
+	return __radix_tree_lookup(root, index, NULL, NULL);
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
@@ -896,103 +974,73 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
 
-
 /**
- *	radix_tree_next_hole    -    find the next hole (not-present entry)
- *	@root:		tree root
- *	@index:		index key
- *	@max_scan:	maximum range to search
+ *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
  *
- *	Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
- *	indexed hole.
+ *	Performs an index-ascending scan of the tree for present items.  Places
+ *	them at *@results and returns the number of items which were placed at
+ *	*@results.
  *
- *	Returns: the index of the hole if found, otherwise returns an index
- *	outside of the set specified (in which case 'return - index >= max_scan'
- *	will be true). In rare cases of index wrap-around, 0 will be returned.
+ *	The implementation is naive.
  *
- *	radix_tree_next_hole may be called under rcu_read_lock. However, like
- *	radix_tree_gang_lookup, this will not atomically search a snapshot of
- *	the tree at a single point in time. For example, if a hole is created
- *	at index 5, then subsequently a hole is created at index 10,
- *	radix_tree_next_hole covering both indexes may return 10 if called
- *	under rcu_read_lock.
+ *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
+ *	rcu_read_lock. In this case, rather than the returned results being
+ *	an atomic snapshot of the tree at a single point in time, the semantics
+ *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
+ *	have been issued in individual locks, and results stored in 'results'.
  */
-unsigned long radix_tree_next_hole(struct radix_tree_root *root,
-				unsigned long index, unsigned long max_scan)
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items)
 {
-	unsigned long i;
-
-	for (i = 0; i < max_scan; i++) {
-		if (!radix_tree_lookup(root, index))
-			break;
-		index++;
-		if (index == 0)
-			break;
-	}
-
-	return index;
-}
-EXPORT_SYMBOL(radix_tree_next_hole);
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned int ret = 0;
 
-/**
- *	radix_tree_prev_hole    -    find the prev hole (not-present entry)
- *	@root:		tree root
- *	@index:		index key
- *	@max_scan:	maximum range to search
- *
- *	Search backwards in the range [max(index-max_scan+1, 0), index]
- *	for the first hole.
- *
- *	Returns: the index of the hole if found, otherwise returns an index
- *	outside of the set specified (in which case 'index - return >= max_scan'
- *	will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
- *
- *	radix_tree_next_hole may be called under rcu_read_lock. However, like
- *	radix_tree_gang_lookup, this will not atomically search a snapshot of
- *	the tree at a single point in time. For example, if a hole is created
- *	at index 10, then subsequently a hole is created at index 5,
- *	radix_tree_prev_hole covering both indexes may return 5 if called under
- *	rcu_read_lock.
- */
-unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
-				   unsigned long index, unsigned long max_scan)
-{
-	unsigned long i;
+	if (unlikely(!max_items))
+		return 0;
 
-	for (i = 0; i < max_scan; i++) {
-		if (!radix_tree_lookup(root, index))
-			break;
-		index--;
-		if (index == ULONG_MAX)
+	radix_tree_for_each_slot(slot, root, &iter, first_index) {
+		results[ret] = rcu_dereference_raw(*slot);
+		if (!results[ret])
+			continue;
+		if (radix_tree_is_indirect_ptr(results[ret])) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
+		if (++ret == max_items)
 			break;
 	}
 
-	return index;
+	return ret;
 }
-EXPORT_SYMBOL(radix_tree_prev_hole);
+EXPORT_SYMBOL(radix_tree_gang_lookup);
 
 /**
- *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ *	radix_tree_gang_lookup_index - perform multiple lookup on a radix tree
  *	@root:		radix tree root
  *	@results:	where the results of the lookup are placed
+ *	@indices:	where their indices should be placed
  *	@first_index:	start the lookup from this key
  *	@max_items:	place up to this many items at *results
  *
  *	Performs an index-ascending scan of the tree for present items.  Places
  *	them at *@results and returns the number of items which were placed at
- *	*@results.
+ *	*@results. The indices are placed in @indices.
  *
  *	The implementation is naive.
  *
- *	Like radix_tree_lookup, radix_tree_gang_lookup may be called under
- *	rcu_read_lock. In this case, rather than the returned results being
- *	an atomic snapshot of the tree at a single point in time, the semantics
- *	of an RCU protected gang lookup are as though multiple radix_tree_lookups
- *	have been issued in individual locks, and results stored in 'results'.
+ *	Just one difference from radix_tree_gang_lookup, the indices are also
+ *	collected along with the results of lookup.
  */
 unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items)
+radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results,
+			unsigned long *indices, unsigned long first_index,
+			unsigned int max_items)
 {
 	struct radix_tree_iter iter;
 	void **slot;
@@ -1005,13 +1053,15 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 		results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
 		if (!results[ret])
 			continue;
+		if (indices)
+			indices[ret] = iter.index;
 		if (++ret == max_items)
 			break;
 	}
 
 	return ret;
 }
-EXPORT_SYMBOL(radix_tree_gang_lookup);
+EXPORT_SYMBOL(radix_tree_gang_lookup_index);
 
 /**
  *	radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
@@ -1081,9 +1131,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		return 0;
 
 	radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
-		results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot));
+		results[ret] = rcu_dereference_raw(*slot);
 		if (!results[ret])
 			continue;
+		if (radix_tree_is_indirect_ptr(results[ret])) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
 		if (++ret == max_items)
 			break;
 	}
@@ -1203,8 +1257,10 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
 
 		node = indirect_to_ptr(node);
 		max_index = radix_tree_maxindex(node->height);
-		if (cur_index > max_index)
+		if (cur_index > max_index) {
+			rcu_read_unlock();
 			break;
+		}
 
 		cur_index = __locate(node, item, cur_index, &found_index);
 		rcu_read_unlock();
@@ -1285,48 +1341,89 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
 }
 
 /**
- *	radix_tree_delete    -    delete an item from a radix tree
+ *	__radix_tree_delete_node    -    try to free node after clearing a slot
  *	@root:		radix tree root
  *	@index:		index key
+ *	@node:		node containing @index
  *
- *	Remove the item at @index from the radix tree rooted at @root.
+ *	After clearing the slot at @index in @node from radix tree
+ *	rooted at @root, call this function to attempt freeing the
+ *	node and shrinking the tree.
  *
- *	Returns the address of the deleted item, or NULL if it was not present.
+ *	Returns %true if @node was freed, %false otherwise.
  */
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index,
+			      struct radix_tree_node *node)
 {
-	struct radix_tree_node *node = NULL;
-	struct radix_tree_node *slot = NULL;
-	struct radix_tree_node *to_free;
-	unsigned int height, shift;
+	bool deleted = false;
+
+	do {
+		struct radix_tree_node *parent;
+
+		if (node->count) {
+			if (node == indirect_to_ptr(root->rnode)) {
+				radix_tree_shrink(root);
+				if (root->height == 0)
+					deleted = true;
+			}
+			return deleted;
+		}
+
+		parent = node->parent;
+		if (parent) {
+			index >>= RADIX_TREE_MAP_SHIFT;
+
+			parent->slots[index & RADIX_TREE_MAP_MASK] = NULL;
+			parent->count--;
+		} else {
+			root_tag_clear_all(root);
+			root->height = 0;
+			root->rnode = NULL;
+		}
+
+		radix_tree_node_free(node);
+		deleted = true;
+
+		node = parent;
+	} while (node);
+
+	return deleted;
+}
+
+/**
+ *	radix_tree_delete_item    -    delete an item from a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@item:		expected item
+ *
+ *	Remove @item at @index from the radix tree rooted at @root.
+ *
+ *	Returns the address of the deleted item, or NULL if it was not present
+ *	or the entry at the given @index was not @item.
+ */
+void *radix_tree_delete_item(struct radix_tree_root *root,
+			     unsigned long index, void *item)
+{
+	struct radix_tree_node *node;
+	unsigned int offset;
+	void **slot;
+	void *entry;
 	int tag;
-	int uninitialized_var(offset);
 
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
+	entry = __radix_tree_lookup(root, index, &node, &slot);
+	if (!entry)
+		return NULL;
 
-	slot = root->rnode;
-	if (height == 0) {
+	if (item && entry != item)
+		return NULL;
+
+	if (!node) {
 		root_tag_clear_all(root);
 		root->rnode = NULL;
-		goto out;
+		return entry;
 	}
-	slot = indirect_to_ptr(slot);
-	shift = height * RADIX_TREE_MAP_SHIFT;
 
-	do {
-		if (slot == NULL)
-			goto out;
-
-		shift -= RADIX_TREE_MAP_SHIFT;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = slot;
-		slot = slot->slots[offset];
-	} while (shift);
-
-	if (slot == NULL)
-		goto out;
+	offset = index & RADIX_TREE_MAP_MASK;
 
 	/*
 	 * Clear all tags associated with the item to be deleted.
@@ -1337,40 +1434,27 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 			radix_tree_tag_clear(root, index, tag);
 	}
 
-	to_free = NULL;
-	/* Now free the nodes we do not need anymore */
-	while (node) {
-		node->slots[offset] = NULL;
-		node->count--;
-		/*
-		 * Queue the node for deferred freeing after the
-		 * last reference to it disappears (set NULL, above).
-		 */
-		if (to_free)
-			radix_tree_node_free(to_free);
-
-		if (node->count) {
-			if (node == indirect_to_ptr(root->rnode))
-				radix_tree_shrink(root);
-			goto out;
-		}
-
-		/* Node with zero slots in use so free it */
-		to_free = node;
+	node->slots[offset] = NULL;
+	node->count--;
 
-		index >>= RADIX_TREE_MAP_SHIFT;
-		offset = index & RADIX_TREE_MAP_MASK;
-		node = node->parent;
-	}
+	__radix_tree_delete_node(root, index, node);
 
-	root_tag_clear_all(root);
-	root->height = 0;
-	root->rnode = NULL;
-	if (to_free)
-		radix_tree_node_free(to_free);
+	return entry;
+}
+EXPORT_SYMBOL(radix_tree_delete_item);
 
-out:
-	return slot;
+/**
+ *	radix_tree_delete    -    delete an item from a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Remove the item at @index from the radix tree rooted at @root.
+ *
+ *	Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+	return radix_tree_delete_item(root, index, NULL);
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
diff --git a/lib/rbtree.c b/lib/rbtree.c
index d4175565d..1356454e3 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -2,7 +2,8 @@
   Red Black Trees
   (C) 1999  Andrea Arcangeli <andrea@suse.de>
   (C) 2002  David Woodhouse <dwmw2@infradead.org>
-  
+  (C) 2012  Michel Lespinasse <walken@google.com>
+
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
@@ -20,339 +21,428 @@
   linux/lib/rbtree.c
 */
 
-#include <linux/rbtree.h>
+#include <linux/rbtree_augmented.h>
 #include <linux/export.h>
 
-static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
-{
-	struct rb_node *right = node->rb_right;
-	struct rb_node *parent = rb_parent(node);
-
-	if ((node->rb_right = right->rb_left))
-		rb_set_parent(right->rb_left, node);
-	right->rb_left = node;
+/*
+ * red-black trees properties:  http://en.wikipedia.org/wiki/Rbtree
+ *
+ *  1) A node is either red or black
+ *  2) The root is black
+ *  3) All leaves (NULL) are black
+ *  4) Both children of every red node are black
+ *  5) Every simple path from root to leaves contains the same number
+ *     of black nodes.
+ *
+ *  4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
+ *  consecutive red nodes in a path and every red node is therefore followed by
+ *  a black. So if B is the number of black nodes on every simple path (as per
+ *  5), then the longest possible path due to 4 is 2B.
+ *
+ *  We shall indicate color with case, where black nodes are uppercase and red
+ *  nodes will be lowercase. Unknown color nodes shall be drawn as red within
+ *  parentheses and have some accompanying text comment.
+ */
 
-	rb_set_parent(right, parent);
+/*
+ * Notes on lockless lookups:
+ *
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * It also guarantees that if the lookup returns an element it is the 'correct'
+ * one. But not returning an element does _NOT_ mean it's not present.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
 
-	if (parent)
-	{
-		if (node == parent->rb_left)
-			parent->rb_left = right;
-		else
-			parent->rb_right = right;
-	}
-	else
-		root->rb_node = right;
-	rb_set_parent(node, right);
+static inline void rb_set_black(struct rb_node *rb)
+{
+	rb->__rb_parent_color |= RB_BLACK;
 }
 
-static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+static inline struct rb_node *rb_red_parent(struct rb_node *red)
 {
-	struct rb_node *left = node->rb_left;
-	struct rb_node *parent = rb_parent(node);
-
-	if ((node->rb_left = left->rb_right))
-		rb_set_parent(left->rb_right, node);
-	left->rb_right = node;
-
-	rb_set_parent(left, parent);
+	return (struct rb_node *)red->__rb_parent_color;
+}
 
-	if (parent)
-	{
-		if (node == parent->rb_right)
-			parent->rb_right = left;
-		else
-			parent->rb_left = left;
-	}
-	else
-		root->rb_node = left;
-	rb_set_parent(node, left);
+/*
+ * Helper function for rotations:
+ * - old's parent and color get assigned to new
+ * - old gets assigned new as a parent and 'color' as a color.
+ */
+static inline void
+__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
+			struct rb_root *root, int color)
+{
+	struct rb_node *parent = rb_parent(old);
+	new->__rb_parent_color = old->__rb_parent_color;
+	rb_set_parent_color(old, new, color);
+	__rb_change_child(old, new, parent, root);
 }
 
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
+static __always_inline void
+__rb_insert(struct rb_node *node, struct rb_root *root,
+	    void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	struct rb_node *parent, *gparent;
-
-	while ((parent = rb_parent(node)) && rb_is_red(parent))
-	{
-		gparent = rb_parent(parent);
-
-		if (parent == gparent->rb_left)
-		{
-			{
-				register struct rb_node *uncle = gparent->rb_right;
-				if (uncle && rb_is_red(uncle))
-				{
-					rb_set_black(uncle);
-					rb_set_black(parent);
-					rb_set_red(gparent);
-					node = gparent;
-					continue;
-				}
+	struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
+
+	while (true) {
+		/*
+		 * Loop invariant: node is red
+		 *
+		 * If there is a black parent, we are done.
+		 * Otherwise, take some corrective action as we don't
+		 * want a red root or two consecutive red nodes.
+		 */
+		if (!parent) {
+			rb_set_parent_color(node, NULL, RB_BLACK);
+			break;
+		} else if (rb_is_black(parent))
+			break;
+
+		gparent = rb_red_parent(parent);
+
+		tmp = gparent->rb_right;
+		if (parent != tmp) {	/* parent == gparent->rb_left */
+			if (tmp && rb_is_red(tmp)) {
+				/*
+				 * Case 1 - color flips
+				 *
+				 *       G            g
+				 *      / \          / \
+				 *     p   u  -->   P   U
+				 *    /            /
+				 *   n            n
+				 *
+				 * However, since g's parent might be red, and
+				 * 4) does not allow this, we need to recurse
+				 * at g.
+				 */
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+				rb_set_parent_color(parent, gparent, RB_BLACK);
+				node = gparent;
+				parent = rb_parent(node);
+				rb_set_parent_color(node, parent, RB_RED);
+				continue;
 			}
 
-			if (parent->rb_right == node)
-			{
-				register struct rb_node *tmp;
-				__rb_rotate_left(parent, root);
-				tmp = parent;
+			tmp = parent->rb_right;
+			if (node == tmp) {
+				/*
+				 * Case 2 - left rotate at parent
+				 *
+				 *      G             G
+				 *     / \           / \
+				 *    p   U  -->    n   U
+				 *     \           /
+				 *      n         p
+				 *
+				 * This still leaves us in violation of 4), the
+				 * continuation into Case 3 will fix that.
+				 */
+				tmp = node->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp);
+				WRITE_ONCE(node->rb_left, parent);
+				if (tmp)
+					rb_set_parent_color(tmp, parent,
+							    RB_BLACK);
+				rb_set_parent_color(parent, node, RB_RED);
+				augment_rotate(parent, node);
 				parent = node;
-				node = tmp;
+				tmp = node->rb_right;
 			}
 
-			rb_set_black(parent);
-			rb_set_red(gparent);
-			__rb_rotate_right(gparent, root);
+			/*
+			 * Case 3 - right rotate at gparent
+			 *
+			 *        G           P
+			 *       / \         / \
+			 *      p   U  -->  n   g
+			 *     /                 \
+			 *    n                   U
+			 */
+			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+			WRITE_ONCE(parent->rb_right, gparent);
+			if (tmp)
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
+			augment_rotate(gparent, parent);
+			break;
 		} else {
-			{
-				register struct rb_node *uncle = gparent->rb_left;
-				if (uncle && rb_is_red(uncle))
-				{
-					rb_set_black(uncle);
-					rb_set_black(parent);
-					rb_set_red(gparent);
-					node = gparent;
-					continue;
-				}
+			tmp = gparent->rb_left;
+			if (tmp && rb_is_red(tmp)) {
+				/* Case 1 - color flips */
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+				rb_set_parent_color(parent, gparent, RB_BLACK);
+				node = gparent;
+				parent = rb_parent(node);
+				rb_set_parent_color(node, parent, RB_RED);
+				continue;
 			}
 
-			if (parent->rb_left == node)
-			{
-				register struct rb_node *tmp;
-				__rb_rotate_right(parent, root);
-				tmp = parent;
+			tmp = parent->rb_left;
+			if (node == tmp) {
+				/* Case 2 - right rotate at parent */
+				tmp = node->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp);
+				WRITE_ONCE(node->rb_right, parent);
+				if (tmp)
+					rb_set_parent_color(tmp, parent,
+							    RB_BLACK);
+				rb_set_parent_color(parent, node, RB_RED);
+				augment_rotate(parent, node);
 				parent = node;
-				node = tmp;
+				tmp = node->rb_left;
 			}
 
-			rb_set_black(parent);
-			rb_set_red(gparent);
-			__rb_rotate_left(gparent, root);
+			/* Case 3 - left rotate at gparent */
+			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+			WRITE_ONCE(parent->rb_left, gparent);
+			if (tmp)
+				rb_set_parent_color(tmp, gparent, RB_BLACK);
+			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
+			augment_rotate(gparent, parent);
+			break;
 		}
 	}
-
-	rb_set_black(root->rb_node);
 }
-EXPORT_SYMBOL(rb_insert_color);
 
-static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
-			     struct rb_root *root)
+/*
+ * Inline version for rb_erase() use - we want to be able to inline
+ * and eliminate the dummy_rotate callback there
+ */
+static __always_inline void
+____rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	struct rb_node *other;
-
-	while ((!node || rb_is_black(node)) && node != root->rb_node)
-	{
-		if (parent->rb_left == node)
-		{
-			other = parent->rb_right;
-			if (rb_is_red(other))
-			{
-				rb_set_black(other);
-				rb_set_red(parent);
-				__rb_rotate_left(parent, root);
-				other = parent->rb_right;
+	struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
+
+	while (true) {
+		/*
+		 * Loop invariants:
+		 * - node is black (or NULL on first iteration)
+		 * - node is not the root (parent is not NULL)
+		 * - All leaf paths going through parent and node have a
+		 *   black node count that is 1 lower than other leaf paths.
+		 */
+		sibling = parent->rb_right;
+		if (node != sibling) {	/* node == parent->rb_left */
+			if (rb_is_red(sibling)) {
+				/*
+				 * Case 1 - left rotate at parent
+				 *
+				 *     P               S
+				 *    / \             / \
+				 *   N   s    -->    p   Sr
+				 *      / \         / \
+				 *     Sl  Sr      N   Sl
+				 */
+				tmp1 = sibling->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp1);
+				WRITE_ONCE(sibling->rb_left, parent);
+				rb_set_parent_color(tmp1, parent, RB_BLACK);
+				__rb_rotate_set_parents(parent, sibling, root,
+							RB_RED);
+				augment_rotate(parent, sibling);
+				sibling = tmp1;
 			}
-			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-			    (!other->rb_right || rb_is_black(other->rb_right)))
-			{
-				rb_set_red(other);
-				node = parent;
-				parent = rb_parent(node);
-			}
-			else
-			{
-				if (!other->rb_right || rb_is_black(other->rb_right))
-				{
-					rb_set_black(other->rb_left);
-					rb_set_red(other);
-					__rb_rotate_right(other, root);
-					other = parent->rb_right;
+			tmp1 = sibling->rb_right;
+			if (!tmp1 || rb_is_black(tmp1)) {
+				tmp2 = sibling->rb_left;
+				if (!tmp2 || rb_is_black(tmp2)) {
+					/*
+					 * Case 2 - sibling color flip
+					 * (p could be either color here)
+					 *
+					 *    (p)           (p)
+					 *    / \           / \
+					 *   N   S    -->  N   s
+					 *      / \           / \
+					 *     Sl  Sr        Sl  Sr
+					 *
+					 * This leaves us violating 5) which
+					 * can be fixed by flipping p to black
+					 * if it was red, or by recursing at p.
+					 * p is red when coming from Case 1.
+					 */
+					rb_set_parent_color(sibling, parent,
+							    RB_RED);
+					if (rb_is_red(parent))
+						rb_set_black(parent);
+					else {
+						node = parent;
+						parent = rb_parent(node);
+						if (parent)
+							continue;
+					}
+					break;
 				}
-				rb_set_color(other, rb_color(parent));
-				rb_set_black(parent);
-				rb_set_black(other->rb_right);
-				__rb_rotate_left(parent, root);
-				node = root->rb_node;
-				break;
-			}
-		}
-		else
-		{
-			other = parent->rb_left;
-			if (rb_is_red(other))
-			{
-				rb_set_black(other);
-				rb_set_red(parent);
-				__rb_rotate_right(parent, root);
-				other = parent->rb_left;
+				/*
+				 * Case 3 - right rotate at sibling
+				 * (p could be either color here)
+				 *
+				 *   (p)           (p)
+				 *   / \           / \
+				 *  N   S    -->  N   Sl
+				 *     / \             \
+				 *    sl  Sr            s
+				 *                       \
+				 *                        Sr
+				 */
+				tmp1 = tmp2->rb_right;
+				WRITE_ONCE(sibling->rb_left, tmp1);
+				WRITE_ONCE(tmp2->rb_right, sibling);
+				WRITE_ONCE(parent->rb_right, tmp2);
+				if (tmp1)
+					rb_set_parent_color(tmp1, sibling,
+							    RB_BLACK);
+				augment_rotate(sibling, tmp2);
+				tmp1 = sibling;
+				sibling = tmp2;
 			}
-			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
-			    (!other->rb_right || rb_is_black(other->rb_right)))
-			{
-				rb_set_red(other);
-				node = parent;
-				parent = rb_parent(node);
+			/*
+			 * Case 4 - left rotate at parent + color flips
+			 * (p and sl could be either color here.
+			 *  After rotation, p becomes black, s acquires
+			 *  p's color, and sl keeps its color)
+			 *
+			 *      (p)             (s)
+			 *      / \             / \
+			 *     N   S     -->   P   Sr
+			 *        / \         / \
+			 *      (sl) sr      N  (sl)
+			 */
+			tmp2 = sibling->rb_left;
+			WRITE_ONCE(parent->rb_right, tmp2);
+			WRITE_ONCE(sibling->rb_left, parent);
+			rb_set_parent_color(tmp1, sibling, RB_BLACK);
+			if (tmp2)
+				rb_set_parent(tmp2, parent);
+			__rb_rotate_set_parents(parent, sibling, root,
+						RB_BLACK);
+			augment_rotate(parent, sibling);
+			break;
+		} else {
+			sibling = parent->rb_left;
+			if (rb_is_red(sibling)) {
+				/* Case 1 - right rotate at parent */
+				tmp1 = sibling->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp1);
+				WRITE_ONCE(sibling->rb_right, parent);
+				rb_set_parent_color(tmp1, parent, RB_BLACK);
+				__rb_rotate_set_parents(parent, sibling, root,
+							RB_RED);
+				augment_rotate(parent, sibling);
+				sibling = tmp1;
 			}
-			else
-			{
-				if (!other->rb_left || rb_is_black(other->rb_left))
-				{
-					rb_set_black(other->rb_right);
-					rb_set_red(other);
-					__rb_rotate_left(other, root);
-					other = parent->rb_left;
+			tmp1 = sibling->rb_left;
+			if (!tmp1 || rb_is_black(tmp1)) {
+				tmp2 = sibling->rb_right;
+				if (!tmp2 || rb_is_black(tmp2)) {
+					/* Case 2 - sibling color flip */
+					rb_set_parent_color(sibling, parent,
+							    RB_RED);
+					if (rb_is_red(parent))
+						rb_set_black(parent);
+					else {
+						node = parent;
+						parent = rb_parent(node);
+						if (parent)
+							continue;
+					}
+					break;
 				}
-				rb_set_color(other, rb_color(parent));
-				rb_set_black(parent);
-				rb_set_black(other->rb_left);
-				__rb_rotate_right(parent, root);
-				node = root->rb_node;
-				break;
+				/* Case 3 - right rotate at sibling */
+				tmp1 = tmp2->rb_left;
+				WRITE_ONCE(sibling->rb_right, tmp1);
+				WRITE_ONCE(tmp2->rb_left, sibling);
+				WRITE_ONCE(parent->rb_left, tmp2);
+				if (tmp1)
+					rb_set_parent_color(tmp1, sibling,
+							    RB_BLACK);
+				augment_rotate(sibling, tmp2);
+				tmp1 = sibling;
+				sibling = tmp2;
 			}
+			/* Case 4 - left rotate at parent + color flips */
+			tmp2 = sibling->rb_right;
+			WRITE_ONCE(parent->rb_left, tmp2);
+			WRITE_ONCE(sibling->rb_right, parent);
+			rb_set_parent_color(tmp1, sibling, RB_BLACK);
+			if (tmp2)
+				rb_set_parent(tmp2, parent);
+			__rb_rotate_set_parents(parent, sibling, root,
+						RB_BLACK);
+			augment_rotate(parent, sibling);
+			break;
 		}
 	}
-	if (node)
-		rb_set_black(node);
 }
 
-void rb_erase(struct rb_node *node, struct rb_root *root)
+/* Non-inline version for rb_erase_augmented() use */
+void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	struct rb_node *child, *parent;
-	int color;
-
-	if (!node->rb_left)
-		child = node->rb_right;
-	else if (!node->rb_right)
-		child = node->rb_left;
-	else
-	{
-		struct rb_node *old = node, *left;
-
-		node = node->rb_right;
-		while ((left = node->rb_left) != NULL)
-			node = left;
-
-		if (rb_parent(old)) {
-			if (rb_parent(old)->rb_left == old)
-				rb_parent(old)->rb_left = node;
-			else
-				rb_parent(old)->rb_right = node;
-		} else
-			root->rb_node = node;
-
-		child = node->rb_right;
-		parent = rb_parent(node);
-		color = rb_color(node);
-
-		if (parent == old) {
-			parent = node;
-		} else {
-			if (child)
-				rb_set_parent(child, parent);
-			parent->rb_left = child;
-
-			node->rb_right = old->rb_right;
-			rb_set_parent(old->rb_right, node);
-		}
-
-		node->rb_parent_color = old->rb_parent_color;
-		node->rb_left = old->rb_left;
-		rb_set_parent(old->rb_left, node);
-
-		goto color;
-	}
-
-	parent = rb_parent(node);
-	color = rb_color(node);
-
-	if (child)
-		rb_set_parent(child, parent);
-	if (parent)
-	{
-		if (parent->rb_left == node)
-			parent->rb_left = child;
-		else
-			parent->rb_right = child;
-	}
-	else
-		root->rb_node = child;
-
- color:
-	if (color == RB_BLACK)
-		__rb_erase_color(child, parent, root);
+	____rb_erase_color(parent, root, augment_rotate);
 }
-EXPORT_SYMBOL(rb_erase);
+EXPORT_SYMBOL(__rb_erase_color);
 
-static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data)
-{
-	struct rb_node *parent;
+/*
+ * Non-augmented rbtree manipulation functions.
+ *
+ * We use dummy augmented callbacks here, and have the compiler optimize them
+ * out of the rb_insert_color() and rb_erase() function definitions.
+ */
 
-up:
-	func(node, data);
-	parent = rb_parent(node);
-	if (!parent)
-		return;
+static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
+static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
+static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
 
-	if (node == parent->rb_left && parent->rb_right)
-		func(parent->rb_right, data);
-	else if (parent->rb_left)
-		func(parent->rb_left, data);
+static const struct rb_augment_callbacks dummy_callbacks = {
+	dummy_propagate, dummy_copy, dummy_rotate
+};
 
-	node = parent;
-	goto up;
-}
-
-/*
- * after inserting @node into the tree, update the tree to account for
- * both the new entry and any damage done by rebalance
- */
-void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data)
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	if (node->rb_left)
-		node = node->rb_left;
-	else if (node->rb_right)
-		node = node->rb_right;
-
-	rb_augment_path(node, func, data);
+	__rb_insert(node, root, dummy_rotate);
 }
-EXPORT_SYMBOL(rb_augment_insert);
+EXPORT_SYMBOL(rb_insert_color);
 
-/*
- * before removing the node, find the deepest node on the rebalance path
- * that will still be there after @node gets removed
- */
-struct rb_node *rb_augment_erase_begin(struct rb_node *node)
+void rb_erase(struct rb_node *node, struct rb_root *root)
 {
-	struct rb_node *deepest;
-
-	if (!node->rb_right && !node->rb_left)
-		deepest = rb_parent(node);
-	else if (!node->rb_right)
-		deepest = node->rb_left;
-	else if (!node->rb_left)
-		deepest = node->rb_right;
-	else {
-		deepest = rb_next(node);
-		if (deepest->rb_right)
-			deepest = deepest->rb_right;
-		else if (rb_parent(deepest) != node)
-			deepest = rb_parent(deepest);
-	}
-
-	return deepest;
+	struct rb_node *rebalance;
+	rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
+	if (rebalance)
+		____rb_erase_color(rebalance, root, dummy_rotate);
 }
-EXPORT_SYMBOL(rb_augment_erase_begin);
+EXPORT_SYMBOL(rb_erase);
 
 /*
- * after removal, update the tree to account for the removed entry
- * and any rebalance damage.
+ * Augmented rbtree manipulation functions.
+ *
+ * This instantiates the same __always_inline functions as in the non-augmented
+ * case, but this time with user-defined callbacks.
  */
-void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data)
+
+void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
+	void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
 {
-	if (node)
-		rb_augment_path(node, func, data);
+	__rb_insert(node, root, augment_rotate);
 }
-EXPORT_SYMBOL(rb_augment_erase_end);
+EXPORT_SYMBOL(__rb_insert_augmented);
 
 /*
  * This function returns the first node (in sort order) of the tree.
@@ -387,11 +477,13 @@ struct rb_node *rb_next(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
-	if (rb_parent(node) == node)
+	if (RB_EMPTY_NODE(node))
 		return NULL;
 
-	/* If we have a right-hand child, go down and then left as far
-	   as we can. */
+	/*
+	 * If we have a right-hand child, go down and then left as far
+	 * as we can.
+	 */
 	if (node->rb_right) {
 		node = node->rb_right; 
 		while (node->rb_left)
@@ -399,12 +491,13 @@ struct rb_node *rb_next(const struct rb_node *node)
 		return (struct rb_node *)node;
 	}
 
-	/* No right-hand children.  Everything down and left is
-	   smaller than us, so any 'next' node must be in the general
-	   direction of our parent. Go up the tree; any time the
-	   ancestor is a right-hand child of its parent, keep going
-	   up. First time it's a left-hand child of its parent, said
-	   parent is our 'next' node. */
+	/*
+	 * No right-hand children. Everything down and left is smaller than us,
+	 * so any 'next' node must be in the general direction of our parent.
+	 * Go up the tree; any time the ancestor is a right-hand child of its
+	 * parent, keep going up. First time it's a left-hand child of its
+	 * parent, said parent is our 'next' node.
+	 */
 	while ((parent = rb_parent(node)) && node == parent->rb_right)
 		node = parent;
 
@@ -416,11 +509,13 @@ struct rb_node *rb_prev(const struct rb_node *node)
 {
 	struct rb_node *parent;
 
-	if (rb_parent(node) == node)
+	if (RB_EMPTY_NODE(node))
 		return NULL;
 
-	/* If we have a left-hand child, go down and then right as far
-	   as we can. */
+	/*
+	 * If we have a left-hand child, go down and then right as far
+	 * as we can.
+	 */
 	if (node->rb_left) {
 		node = node->rb_left; 
 		while (node->rb_right)
@@ -428,8 +523,10 @@ struct rb_node *rb_prev(const struct rb_node *node)
 		return (struct rb_node *)node;
 	}
 
-	/* No left-hand children. Go up till we find an ancestor which
-	   is a right-hand child of its parent */
+	/*
+	 * No left-hand children. Go up till we find an ancestor which
+	 * is a right-hand child of its parent.
+	 */
 	while ((parent = rb_parent(node)) && node == parent->rb_left)
 		node = parent;
 
@@ -443,14 +540,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 	struct rb_node *parent = rb_parent(victim);
 
 	/* Set the surrounding nodes to point to the replacement */
-	if (parent) {
-		if (victim == parent->rb_left)
-			parent->rb_left = new;
-		else
-			parent->rb_right = new;
-	} else {
-		root->rb_node = new;
-	}
+	__rb_change_child(victim, new, parent, root);
 	if (victim->rb_left)
 		rb_set_parent(victim->rb_left, new);
 	if (victim->rb_right)
@@ -460,3 +550,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 	*new = *victim;
 }
 EXPORT_SYMBOL(rb_replace_node);
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+	for (;;) {
+		if (node->rb_left)
+			node = node->rb_left;
+		else if (node->rb_right)
+			node = node->rb_right;
+		else
+			return (struct rb_node *)node;
+	}
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+	const struct rb_node *parent;
+	if (!node)
+		return NULL;
+	parent = rb_parent(node);
+
+	/* If we're sitting on node, we've already seen our children */
+	if (parent && node == parent->rb_left && parent->rb_right) {
+		/* If we are the parent's left node, go to the parent's right
+		 * node then all the way down to the left */
+		return rb_left_deepest_node(parent->rb_right);
+	} else
+		/* Otherwise we are the parent's right node, and the parent
+		 * should be next */
+		return (struct rb_node *)parent;
+}
+EXPORT_SYMBOL(rb_next_postorder);
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+	if (!root->rb_node)
+		return NULL;
+
+	return rb_left_deepest_node(root->rb_node);
+}
+EXPORT_SYMBOL(rb_first_postorder);
diff --git a/mm/Kconfig b/mm/Kconfig
index e0b4f6fbb..81c5191d2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -466,6 +466,22 @@ config ZSWAP
 	  from this compressed store much faster than most tradition
 	  swap devices resulting in reduced I/O and faster performance
 	  for many workloads.
+	  
+config ZCACHE
+       bool "Compressed cache for file pages (EXPERIMENTAL)"
+       depends on CRYPTO && CLEANCACHE
+       select CRYPTO_LZO
+       select ZBUD
+       default n
+       help
+         A compressed cache for file pages.
+         It takes active file pages that are in the process of being reclaimed
+         and attempts to compress them into a dynamically allocated RAM-based
+         memory pool.
+
+         If this process is successful, when those file pages needed again, the
+         I/O reading operation was avoided. This results in a significant performance
+         gains under memory pressure for systems full with file pages.
 
 config SWAP_ENABLE_READAHEAD
 	bool "Enable readahead on page swap in"
@@ -482,6 +498,24 @@ config ZSWAP_ENABLE_WRITEBACK
 	depends on ZSWAP
 	default n
 
+config ZPOOL
+	tristate "Common API for compressed memory storage"
+	default n
+	help
+	  Compressed memory storage API.  This allows using either zbud or
+	  zsmalloc.
+
+config ZBUD
+	tristate "Low density storage for compressed pages"
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page.  While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.
+
+
 config DIRECT_RECLAIM_FILE_PAGES_ONLY
 	bool "Reclaim file pages only on direct reclaim path"
 	depends on ZSWAP
diff --git a/mm/Makefile b/mm/Makefile
index 0774dd357..f279b0dc1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
+obj-$(CONFIG_ZCACHE)	+= zcache.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
@@ -54,3 +55,5 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_ZSMALLOC_NEW) += zsmalloc.o
+obj-$(CONFIG_ZPOOL)	+= zpool.o
+obj-$(CONFIG_ZBUD)	+= zbud.o
\ No newline at end of file
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f..1d97b0d05 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,23 +19,13 @@
 #include <linux/cleancache.h>
 
 /*
- * This global enablement flag may be read thousands of times per second
- * by cleancache_get/put/invalidate even on systems where cleancache_ops
- * is not claimed (e.g. cleancache is config'ed on but remains
- * disabled), so is preferred to the slower alternative: a function
- * call that checks a non-global.
- */
-int cleancache_enabled __read_mostly;
-EXPORT_SYMBOL(cleancache_enabled);
-
-/*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
-static struct cleancache_ops cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
 
 /*
- * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * Counters available via /sys/kernel/debug/cleancache (if debugfs is
  * properly configured.  These are for information only so are not protected
  * against increment races.
  */
@@ -44,32 +34,107 @@ static u64 cleancache_failed_gets;
 static u64 cleancache_puts;
 static u64 cleancache_invalidates;
 
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+	switch (sb->cleancache_poolid) {
+	case CLEANCACHE_NO_BACKEND:
+		__cleancache_init_fs(sb);
+		break;
+	case CLEANCACHE_NO_BACKEND_SHARED:
+		__cleancache_init_shared_fs(sb);
+		break;
+	}
+}
+
 /*
- * register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting
+ * Register operations for cleancache. Returns 0 on success.
  */
-struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
 {
-	struct cleancache_ops old = cleancache_ops;
+	if (cmpxchg(&cleancache_ops, NULL, ops))
+		return -EBUSY;
 
-	cleancache_ops = *ops;
-	cleancache_enabled = 1;
-	return old;
+	/*
+	 * A cleancache backend can be built as a module and hence loaded after
+	 * a cleancache enabled filesystem has called cleancache_init_fs. To
+	 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+	 * for each active super block. To differentiate between local and
+	 * shared filesystems, we temporarily initialize sb->cleancache_poolid
+	 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+	 * respectively in case there is no backend registered at the time
+	 * cleancache_init_fs or cleancache_init_shared_fs is called.
+	 *
+	 * Since filesystems can be mounted concurrently with cleancache
+	 * backend registration, we have to be careful to guarantee that all
+	 * cleancache enabled filesystems that has been mounted by the time
+	 * cleancache_register_ops is called has got and all mounted later will
+	 * get cleancache_poolid. This is assured by the following statements
+	 * tied together:
+	 *
+	 * a) iterate_supers skips only those super blocks that has started
+	 *    ->kill_sb
+	 *
+	 * b) if iterate_supers encounters a super block that has not finished
+	 *    ->mount yet, it waits until it is finished
+	 *
+	 * c) cleancache_init_fs is called from ->mount and
+	 *    cleancache_invalidate_fs is called from ->kill_sb
+	 *
+	 * d) we call iterate_supers after cleancache_ops has been set
+	 *
+	 * From a) it follows that if iterate_supers skips a super block, then
+	 * either the super block is already dead, in which case we do not need
+	 * to bother initializing cleancache for it, or it was mounted after we
+	 * initiated iterate_supers. In the latter case, it must have seen
+	 * cleancache_ops set according to d) and initialized cleancache from
+	 * ->mount by itself according to c). This proves that we call
+	 * ->init_fs at least once for each active super block.
+	 *
+	 * From b) and c) it follows that if iterate_supers encounters a super
+	 * block that has already started ->init_fs, it will wait until ->mount
+	 * and hence ->init_fs has finished, then check cleancache_poolid, see
+	 * that it has already been set and therefore do nothing. This proves
+	 * that we call ->init_fs no more than once for each super block.
+	 *
+	 * Combined together, the last two paragraphs prove the function
+	 * correctness.
+	 *
+	 * Note that various cleancache callbacks may proceed before this
+	 * function is called or even concurrently with it, but since
+	 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+	 * until the corresponding ->init_fs has been actually called and
+	 * cleancache_ops has been set.
+	 */
+	iterate_supers(cleancache_register_ops_sb, NULL);
+	return 0;
 }
 EXPORT_SYMBOL(cleancache_register_ops);
 
 /* Called by a cleancache-enabled filesystem at time of mount */
 void __cleancache_init_fs(struct super_block *sb)
 {
-	sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
+	int pool_id = CLEANCACHE_NO_BACKEND;
+
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
+	}
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_fs);
 
 /* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
 {
-	sb->cleancache_poolid =
-		(*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
+	int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
+
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
+	}
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_shared_fs);
 
@@ -91,7 +156,7 @@ static int cleancache_get_key(struct inode *inode,
 			struct dentry d;
 			d.d_inode = inode;
 			len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
-			if (len <= 0 || len == 255)
+			if (len <= FILEID_ROOT || len == FILEID_INVALID)
 				return -1;
 			if (maxlen > CLEANCACHE_KEY_MAX)
 				return -1;
@@ -106,6 +171,10 @@ static int cleancache_get_key(struct inode *inode,
  * successful, use it to fill the specified page with data and return 0.
  * The pageframe is unchanged and returns -1 if the get fails.
  * Page must be locked by caller.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
  */
 int __cleancache_get_page(struct page *page)
 {
@@ -113,6 +182,11 @@ int __cleancache_get_page(struct page *page)
 	int pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
+	if (!cleancache_ops) {
+		cleancache_failed_gets++;
+		goto out;
+	}
+
 	VM_BUG_ON(!PageLocked(page));
 	pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (pool_id < 0)
@@ -121,7 +195,7 @@ int __cleancache_get_page(struct page *page)
 	if (cleancache_get_key(page->mapping->host, &key) < 0)
 		goto out;
 
-	ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
+	ret = cleancache_ops->get_page(pool_id, key, page->index, page);
 	if (ret == 0)
 		cleancache_succ_gets++;
 	else
@@ -136,17 +210,26 @@ EXPORT_SYMBOL(__cleancache_get_page);
  * (previously-obtained per-filesystem) poolid and the page's,
  * inode and page index.  Page must be locked.  Note that a put_page
  * always "succeeds", though a subsequent get_page may succeed or fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
  */
 void __cleancache_put_page(struct page *page)
 {
 	int pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
+	if (!cleancache_ops) {
+		cleancache_puts++;
+		return;
+	}
+
 	VM_BUG_ON(!PageLocked(page));
 	pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (pool_id >= 0 &&
-	      cleancache_get_key(page->mapping->host, &key) >= 0) {
-		(*cleancache_ops.put_page)(pool_id, key, page->index, page);
+		cleancache_get_key(page->mapping->host, &key) >= 0) {
+		cleancache_ops->put_page(pool_id, key, page->index, page);
 		cleancache_puts++;
 	}
 }
@@ -155,6 +238,10 @@ EXPORT_SYMBOL(__cleancache_put_page);
 /*
  * Invalidate any data from cleancache associated with the poolid and the
  * page's inode and page index so that a subsequent "get" will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
  */
 void __cleancache_invalidate_page(struct address_space *mapping,
 					struct page *page)
@@ -163,11 +250,14 @@ void __cleancache_invalidate_page(struct address_space *mapping,
 	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
+	if (!cleancache_ops)
+		return;
+
 	if (pool_id >= 0) {
 		VM_BUG_ON(!PageLocked(page));
 		if (cleancache_get_key(mapping->host, &key) >= 0) {
-			(*cleancache_ops.invalidate_page)(pool_id,
-							  key, page->index);
+			cleancache_ops->invalidate_page(pool_id,
+					key, page->index);
 			cleancache_invalidates++;
 		}
 	}
@@ -178,29 +268,38 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
  * Invalidate all data from cleancache associated with the poolid and the
  * mappings's inode so that all subsequent gets to this poolid/inode
  * will fail.
+ *
+ * The function has two checks before any action is taken - whether
+ * a backend is registered and whether the sb->cleancache_poolid
+ * is correct.
  */
 void __cleancache_invalidate_inode(struct address_space *mapping)
 {
 	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
+	if (!cleancache_ops)
+		return;
+
 	if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
-		(*cleancache_ops.invalidate_inode)(pool_id, key);
+		cleancache_ops->invalidate_inode(pool_id, key);
 }
 EXPORT_SYMBOL(__cleancache_invalidate_inode);
 
 /*
  * Called by any cleancache-enabled filesystem at time of unmount;
- * note that pool_id is surrendered and may be reutrned by a subsequent
- * cleancache_init_fs or cleancache_init_shared_fs
+ * note that pool_id is surrendered and may be returned by a subsequent
+ * cleancache_init_fs or cleancache_init_shared_fs.
  */
 void __cleancache_invalidate_fs(struct super_block *sb)
 {
-	if (sb->cleancache_poolid >= 0) {
-		int old_poolid = sb->cleancache_poolid;
-		sb->cleancache_poolid = -1;
-		(*cleancache_ops.invalidate_fs)(old_poolid);
-	}
+	int pool_id;
+
+	pool_id = sb->cleancache_poolid;
+	sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+	if (cleancache_ops && pool_id >= 0)
+		cleancache_ops->invalidate_fs(pool_id);
 }
 EXPORT_SYMBOL(__cleancache_invalidate_fs);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 02c00eeef..e5e1543f9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -666,6 +666,86 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 	}
 }
 
+/**
+ * page_cache_next_hole - find the next hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
+ * lowest indexed hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'return - index >=
+ * max_scan' will be true). In rare cases of index wrap-around, 0 will
+ * be returned.
+ *
+ * page_cache_next_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 5, then subsequently a hole is created at
+ * index 10, page_cache_next_hole covering both indexes may return 10
+ * if called under rcu_read_lock.
+ */
+pgoff_t page_cache_next_hole(struct address_space *mapping,
+                             pgoff_t index, unsigned long max_scan)
+{
+        unsigned long i;
+
+        for (i = 0; i < max_scan; i++) {
+                struct page *page;
+
+                page = radix_tree_lookup(&mapping->page_tree, index);
+                if (!page || radix_tree_exceptional_entry(page))
+                        break;
+                index++;
+                if (index == 0)
+                        break;
+        }
+
+        return index;
+}
+EXPORT_SYMBOL(page_cache_next_hole);
+
+/**
+ * page_cache_prev_hole - find the prev hole (not-present entry)
+ * @mapping: mapping
+ * @index: index
+ * @max_scan: maximum range to search
+ *
+ * Search backwards in the range [max(index-max_scan+1, 0), index] for
+ * the first hole.
+ *
+ * Returns: the index of the hole if found, otherwise returns an index
+ * outside of the set specified (in which case 'index - return >=
+ * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
+ * will be returned.
+ *
+ * page_cache_prev_hole may be called under rcu_read_lock. However,
+ * like radix_tree_gang_lookup, this will not atomically search a
+ * snapshot of the tree at a single point in time. For example, if a
+ * hole is created at index 10, then subsequently a hole is created at
+ * index 5, page_cache_prev_hole covering both indexes may return 5 if
+ * called under rcu_read_lock.
+ */
+pgoff_t page_cache_prev_hole(struct address_space *mapping,
+			     pgoff_t index, unsigned long max_scan)
+{
+	unsigned long i;
+
+	for (i = 0; i < max_scan; i++) {
+		if (!radix_tree_lookup(&mapping->page_tree, index))
+			break;
+		index--;
+		if (index == ULONG_MAX)
+			break;
+	}
+
+	return index;
+}
+EXPORT_SYMBOL(page_cache_prev_hole);
+
+
 /**
  * find_get_page - find and get a page reference
  * @mapping: the address_space to search
@@ -2442,6 +2522,11 @@ static ssize_t generic_perform_write(struct file *file,
 			break;
 		}
 
+		if (fatal_signal_pending(current)) {
+			status = -EINTR;
+			break;
+		}
+
 		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
 						&page, &fsdata);
 		if (unlikely(status))
@@ -2482,10 +2567,6 @@ static ssize_t generic_perform_write(struct file *file,
 		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
-		if (fatal_signal_pending(current)) {
-			status = -EINTR;
-			break;
-		}
 	} while (iov_iter_count(i));
 
 	return written ? written : status;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc36e280c..e622aab7f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2503,6 +2503,14 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (iter_vma == vma)
 			continue;
 
+		/*
+		 * Shared VMAs have their own reserves and do not affect
+		 * MAP_PRIVATE accounting but it is possible that a shared
+		 * VMA is using the same page so check and skip such VMAs.
+		 */
+		if (iter_vma->vm_flags & VM_MAYSHARE)
+			continue;
+
 		/*
 		 * Unmap the page from other VMAs without their own reserves.
 		 * They get marked to be SIGKILLed if they fault in these
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5265cb948..6e96f56da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5229,7 +5229,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		swp_entry_t swap = radix_to_swp_entry(page);
 		if (do_swap_account)
 			*entry = swap;
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	return page;
diff --git a/mm/memory.c b/mm/memory.c
index ec059db19..ed5b26277 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3258,7 +3258,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	mem_cgroup_commit_charge_swapin(page, ptr);
 
 	swap_free(entry);
-	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+	if (vm_swap_full(page_swap_info(page)) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		try_to_free_swap(page);
 	unlock_page(page);
 	if (swapcache) {
diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8..8be8c84b9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 	/* shmem/tmpfs may return swap: account for swapcache page too. */
 	if (radix_tree_exceptional_entry(page)) {
 		swp_entry_t swap = radix_to_swp_entry(page);
-		page = find_get_page(&swapper_space, swap.val);
+		page = find_get_page(swap_address_space(swap), swap.val);
 	}
 #endif
 	if (page) {
@@ -135,7 +135,7 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			} else {
 #ifdef CONFIG_SWAP
 				pgoff = entry.val;
-				*vec = mincore_page(&swapper_space, pgoff);
+				*vec = mincore_page(swap_address_space(entry), pgoff);
 #else
 				WARN_ON(1);
 				*vec = 1;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1ce374f97..07b3d1fff 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,7 @@
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
 #include <linux/ksm.h>
+#include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5..8855dec4e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -19,6 +19,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/mmu_notifier.h>
+#include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 75ff3c173..3685abade 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/audit.h>
+#include <linux/sched/sysctl.h>
 
 #include <asm/uaccess.h>
 #include <asm/tlb.h>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79d3c67be..1cfc67c3e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -317,7 +317,7 @@ static void bad_page(struct page *page)
 
 	/* Don't complain about poisoned pages */
 	if (PageHWPoison(page)) {
-		reset_page_mapcount(page); /* remove PageBuddy */
+		page_mapcount_reset(page); /* remove PageBuddy */
 		return;
 	}
 
@@ -349,7 +349,7 @@ static void bad_page(struct page *page)
 	dump_stack();
 out:
 	/* Leave bad fields for debug, except PageBuddy could make trouble */
-	reset_page_mapcount(page); /* remove PageBuddy */
+	page_mapcount_reset(page); /* remove PageBuddy */
 	add_taint(TAINT_BAD_PAGE);
 }
 
@@ -4039,7 +4039,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
-		reset_page_mapcount(page);
+		page_mapcount_reset(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -6312,6 +6312,9 @@ static struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_scfslower, "scfslower"},
 	{1UL << PG_nocache,"nocache"},
 #endif
+#ifdef CONFIG_ZCACHE
+	{1UL << PG_was_active,           "PG_was_active"  },
+#endif
 };
 
 static void dump_page_flags(unsigned long flags)
diff --git a/mm/page_io.c b/mm/page_io.c
index d0b4d4a55..c1d2f81db 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -18,6 +18,7 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/swapops.h>
+#include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/frontswap.h>
@@ -145,17 +146,43 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 int swap_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret = 0;
+	struct swap_info_struct *sis = page_swap_info(page);
 
 	if (try_to_free_swap(page)) {
 		unlock_page(page);
 		goto out;
 	}
+
 	if (frontswap_store(page) == 0) {
 		set_page_writeback(page);
 		unlock_page(page);
 		end_page_writeback(page);
 		goto out;
 	}
+	if (sis->flags & SWP_FILE) {
+		struct kiocb kiocb;
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+		struct iovec iov = {
+			.iov_base = page_address(page),
+			.iov_len  = PAGE_SIZE,
+		};
+
+		init_sync_kiocb(&kiocb, swap_file);
+		kiocb.ki_pos = page_file_offset(page);
+		kiocb.ki_left = PAGE_SIZE;
+		kiocb.ki_nbytes = PAGE_SIZE;
+
+		unlock_page(page);
+		ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
+						&kiocb, &iov,
+						kiocb.ki_pos, 1);
+		if (ret == PAGE_SIZE) {
+			count_vm_event(PSWPOUT);
+			ret = 0;
+		}
+		return ret;
+	}
 	ret = __swap_writepage(page, wbc, end_swap_bio_write);
 out:
 	return ret;
@@ -188,6 +215,7 @@ int swap_readpage(struct page *page)
 {
 	struct bio *bio;
 	int ret = 0;
+	struct swap_info_struct *sis = page_swap_info(page);
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageUptodate(page));
@@ -196,6 +224,15 @@ int swap_readpage(struct page *page)
 		unlock_page(page);
 		goto out;
 	}
+	if (sis->flags & SWP_FILE) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		ret = mapping->a_ops->readpage(swap_file, page);
+		if (!ret)
+			count_vm_event(PSWPIN);
+		return ret;
+	}
 	bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
@@ -207,3 +244,15 @@ int swap_readpage(struct page *page)
 out:
 	return ret;
 }
+
+int swap_set_page_dirty(struct page *page)
+{
+	struct swap_info_struct *sis = page_swap_info(page);
+
+	if (sis->flags & SWP_FILE) {
+		struct address_space *mapping = sis->swap_file->f_mapping;
+		return mapping->a_ops->set_page_dirty(page);
+	} else {
+		return __set_page_dirty_no_writeback(page);
+	}
+}
\ No newline at end of file
diff --git a/mm/readahead.c b/mm/readahead.c
index 0eac4cbf6..abec1a59b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -367,7 +367,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
 	pgoff_t head;
 
 	rcu_read_lock();
-	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+	head = page_cache_prev_hole(mapping, offset - 1, max);
 	rcu_read_unlock();
 
 	return offset - 1 - head;
@@ -446,7 +446,7 @@ ondemand_readahead(struct address_space *mapping,
 		pgoff_t start;
 
 		rcu_read_lock();
-		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+		start = page_cache_next_hole(mapping, offset+1,max);
 		rcu_read_unlock();
 
 		if (!start || start - offset > max)
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42c..dcc976b33 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -117,7 +117,7 @@ static inline void struct_slob_page_wrong_size(void)
  */
 static inline void free_slob_page(struct slob_page *sp)
 {
-	reset_page_mapcount(&sp->page);
+	page_mapcount_reset(&sp->page);
 	sp->page.mapping = NULL;
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 286f736e9..5cb23f8ea 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1419,7 +1419,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 		-pages);
 
 	__ClearPageSlab(page);
-	reset_page_mapcount(page);
+	page_mapcount_reset(page);
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += pages;
 	__free_pages(page, order);
diff --git a/mm/swap.c b/mm/swap.c
index a8feea68a..db74d14f9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -845,7 +845,13 @@ void __init swap_setup(void)
 	unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
 
 #ifdef CONFIG_SWAP
-	bdi_init(swapper_space.backing_dev_info);
+	int i;
+
+	bdi_init(swapper_spaces[0].backing_dev_info);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		spin_lock_init(&swapper_spaces[i].tree_lock);
+		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
+	}
 #endif
 
 	/* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e561867c1..f1ee1e8fa 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -26,7 +26,7 @@
  */
 static const struct address_space_operations swap_aops = {
 	.writepage	= swap_writepage,
-	.set_page_dirty	= __set_page_dirty_no_writeback,
+	.set_page_dirty	= swap_set_page_dirty,
 	.migratepage	= migrate_page,
 };
 
@@ -35,12 +35,12 @@ static struct backing_dev_info swap_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 };
 
-struct address_space swapper_space = {
-	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
-	.a_ops		= &swap_aops,
-	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
-	.backing_dev_info = &swap_backing_dev_info,
+struct address_space swapper_spaces[MAX_SWAPFILES] = {
+	[0 ... MAX_SWAPFILES - 1] = {
+		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+		.a_ops		= &swap_aops,
+		.backing_dev_info = &swap_backing_dev_info,
+	}
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -52,9 +52,19 @@ static struct {
 	unsigned long find_total;
 } swap_cache_info;
 
+unsigned long total_swapcache_pages(void)
+{
+	int i;
+	unsigned long ret = 0;
+
+	for (i = 0; i < MAX_SWAPFILES; i++)
+		ret += swapper_spaces[i].nrpages;
+	return ret;
+}
+
 void show_swap_cache_info(void)
 {
-	printk("%lu pages in swap cache\n", total_swapcache_pages);
+	printk("%lu pages in swap cache\n", total_swapcache_pages());
 	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
 		swap_cache_info.add_total, swap_cache_info.del_total,
 		swap_cache_info.find_success, swap_cache_info.find_total);
@@ -70,6 +80,7 @@ void show_swap_cache_info(void)
 int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
+	struct address_space *address_space;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +90,15 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 	SetPageSwapCache(page);
 	set_page_private(page, entry.val);
 
-	spin_lock_irq(&swapper_space.tree_lock);
-	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	address_space = swap_address_space(entry);
+	spin_lock_irq(&address_space->tree_lock);
+	error = radix_tree_insert(&address_space->page_tree, entry.val, page);
 	if (likely(!error)) {
-		total_swapcache_pages++;
+		address_space->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
 		INC_CACHE_INFO(add_total);
 	}
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&address_space->tree_lock);
 
 	if (unlikely(error)) {
 		/*
@@ -122,14 +134,18 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
  */
 void __delete_from_swap_cache(struct page *page)
 {
+	swp_entry_t entry;
+	struct address_space *address_space;
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON(PageWriteback(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	entry.val = page_private(page);
+	address_space = swap_address_space(entry);
+	radix_tree_delete(&address_space->page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
-	total_swapcache_pages--;
+	address_space->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	INC_CACHE_INFO(del_total);
 }
@@ -195,12 +211,14 @@ int add_to_swap(struct page *page)
 void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
+	struct address_space *address_space;
 
 	entry.val = page_private(page);
 
-	spin_lock_irq(&swapper_space.tree_lock);
+	address_space = swap_address_space(entry);
+	spin_lock_irq(&address_space->tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock_irq(&address_space->tree_lock);
 
 	swapcache_free(entry, page);
 	page_cache_release(page);
@@ -263,7 +281,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 {
 	struct page *page;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_page(swap_address_space(entry), entry.val);
 
 	if (page)
 		INC_CACHE_INFO(find_success);
@@ -290,7 +308,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * called after lookup_swap_cache() failed, re-calling
 		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		found_page = find_get_page(swap_address_space(entry), entry.val);
 		if (found_page)
 			break;
 
@@ -393,7 +411,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct page *page;
 	unsigned long offset = swp_offset(entry);
 	unsigned long start_offset, end_offset;
-	unsigned long mask = (1UL << page_cluster) - 1;
+	unsigned long mask = is_swap_fast(entry) ? 0 :
+				(1UL << page_cluster) - 1;
 
 	/* Read a page_cluster sized and aligned cluster around offset. */
 	start_offset = offset & ~mask;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 99d16ec7c..f00f23601 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -73,6 +73,26 @@ static inline unsigned char swap_count(unsigned char ent)
 	return ent & ~SWAP_HAS_CACHE;	/* may include SWAP_HAS_CONT flag */
 }
 
+bool is_swap_fast(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+	unsigned long type;
+
+	if (non_swap_entry(entry))
+		return false;
+
+	type = swp_type(entry);
+	if (type >= nr_swapfiles)
+		return false;
+
+	p = swap_info[type];
+
+	if (p->flags & SWP_FAST)
+		return true;
+
+	return false;
+}
+
 /* returns 1 if swap entry is freed */
 static int
 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
@@ -81,7 +101,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 	struct page *page;
 	int ret = 0;
 
-	page = find_get_page(&swapper_space, entry.val);
+	page = find_get_page(swap_address_space(entry), entry.val);
 	if (!page)
 		return 0;
 	/*
@@ -212,7 +232,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
-		if (si->flags & SWP_DISCARDABLE) {
+		if (si->flags & SWP_PAGE_DISCARD) {
 			/*
 			 * Start range check on racing allocations, in case
 			 * they overlap the cluster we eventually decide on
@@ -293,7 +313,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 		scan_base = offset = si->lowest_bit;
 
 	/* reuse swap entry of cache-only swap if not busy. */
-	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+	if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) {
 		int swap_was_freed;
 		spin_unlock(&si->lock);
 		swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -322,7 +342,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 
 	if (si->lowest_alloc) {
 		/*
-		 * Only set when SWP_DISCARDABLE, and there's a scan
+		 * Only set when SWP_PAGE_DISCARD, and there's a scan
 		 * for a free cluster in progress or just completed.
 		 */
 		if (found_free_cluster) {
@@ -382,7 +402,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			spin_lock(&si->lock);
 			goto checks;
 		}
-		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) {
 			spin_lock(&si->lock);
 			goto checks;
 		}
@@ -397,7 +417,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			spin_lock(&si->lock);
 			goto checks;
 		}
-		if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) {
 			spin_lock(&si->lock);
 			goto checks;
 		}
@@ -746,7 +766,7 @@ int free_swap_and_cache(swp_entry_t entry)
 	p = swap_info_get(entry);
 	if (p) {
 		if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
-			page = find_get_page(&swapper_space, entry.val);
+			page = find_get_page(swap_address_space(entry), entry.val);
 			if (page && !trylock_page(page)) {
 				page_cache_release(page);
 				page = NULL;
@@ -760,7 +780,7 @@ int free_swap_and_cache(swp_entry_t entry)
 		 * Also recheck PageSwapCache now page is locked (above).
 		 */
 		if (PageSwapCache(page) && !PageWriteback(page) &&
-				(!page_mapped(page) || vm_swap_full())) {
+				(!page_mapped(page) || vm_swap_full(page_swap_info(page)))) {
 			delete_from_swap_cache(page);
 			SetPageDirty(page);
 		}
@@ -1492,7 +1512,9 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  */
 static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 {
-	struct inode *inode;
+	struct file *swap_file = sis->swap_file;
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
 	unsigned blocks_per_page;
 	unsigned long page_no;
 	unsigned blkbits;
@@ -1503,13 +1525,22 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	int nr_extents = 0;
 	int ret;
 
-	inode = sis->swap_file->f_mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		ret = add_swap_extent(sis, 0, sis->max, 0);
 		*span = sis->pages;
 		goto out;
 	}
 
+	if (mapping->a_ops->swap_activate) {
+		ret = mapping->a_ops->swap_activate(swap_file);
+		if (!ret) {
+			sis->flags |= SWP_FILE;
+			ret = add_swap_extent(sis, 0, sis->max, 0);
+			*span = sis->pages;
+		}
+		goto out;
+	}
+	
 	blkbits = inode->i_blkbits;
 	blocks_per_page = PAGE_SIZE >> blkbits;
 
@@ -2083,6 +2114,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	return nr_extents;
 }
 
+/*
+ * Helper to sys_swapon determining if a given swap
+ * backing device queue supports DISCARD operations.
+ */
+static bool swap_discardable(struct swap_info_struct *si)
+{
+	struct request_queue *q = bdev_get_queue(si->bdev);
+
+	if (!q || !blk_queue_discard(q))
+		return false;
+
+	return true;
+}
+
+
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct *p;
@@ -2190,8 +2236,39 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			p->flags |= SWP_SOLIDSTATE;
 			p->cluster_next = 1 + (random32() % p->highest_bit);
 		}
-		if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
-			p->flags |= SWP_DISCARDABLE;
+		if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+			/*
+			 * When discard is enabled for swap with no particular
+			 * policy flagged, we set all swap discard flags here in
+			 * order to sustain backward compatibility with older
+			 * swapon(8) releases.
+			 */
+			p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+				     SWP_PAGE_DISCARD);
+
+			/*
+			 * By flagging sys_swapon, a sysadmin can tell us to
+			 * either do single-time area discards only, or to just
+			 * perform discards for released swap page-clusters.
+			 * Now it's time to adjust the p->flags accordingly.
+			 */
+			if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+				p->flags &= ~SWP_PAGE_DISCARD;
+			else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+				p->flags &= ~SWP_AREA_DISCARD;
+
+			/* issue a swapon-time discard if it's still required */
+			if (p->flags & SWP_AREA_DISCARD) {
+				int err = discard_swap(p);
+				if (unlikely(err))
+					printk(KERN_ERR
+					       "swapon: discard_swap(%p): %d\n",
+						p, err);
+			}
+		}
+
+		if (blk_queue_fast(bdev_get_queue(p->bdev)))
+ 			p->flags |= SWP_FAST;
 	}
 
 	mutex_lock(&swapon_mutex);
@@ -2202,11 +2279,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	enable_swap_info(p, prio, swap_map, frontswap_map);
 
 	printk(KERN_INFO "Adding %uk swap on %s.  "
-			"Priority:%d extents:%d across:%lluk %s%s%s\n",
+			"Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
 		p->pages<<(PAGE_SHIFT-10), name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
 		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
 		(p->flags & SWP_DISCARDABLE) ? "D" : "",
+		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
+ 		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
 		(frontswap_map) ? "FS" : "");
 
 	mutex_unlock(&swapon_mutex);
diff --git a/mm/util.c b/mm/util.c
index ae962b31d..a280deeb1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,8 @@
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -341,6 +343,24 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+struct address_space *page_mapping(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	VM_BUG_ON(PageSlab(page));
+#ifdef CONFIG_SWAP
+	if (unlikely(PageSwapCache(page))) {
+		swp_entry_t entry;
+
+		entry.val = page_private(page);
+		mapping = swap_address_space(entry);
+	} else
+#endif
+	if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+		mapping = NULL;
+	return mapping;
+}
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 530fab823..b27d8ba49 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -274,34 +274,41 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
  *
  * Returns the number of slab objects which we shrunk.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
+unsigned long shrink_slab(struct shrink_control *shrinkctl,
 			  unsigned long nr_pages_scanned,
 			  unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	unsigned long freed = 0;
 
 	if (nr_pages_scanned == 0)
 		nr_pages_scanned = SWAP_CLUSTER_MAX;
 
 	if (!down_read_trylock(&shrinker_rwsem)) {
-		/* Assume we'll be able to shrink next time */
-		ret = 1;
+		/*
+ 		 * If we would return 0, our callers would understand that we
+ 		 * have nothing else to shrink and give up trying. By returning
+ 		 * 1 we keep it going and assume we'll be able to shrink next
+ 		 * time.
+ 		 */
+ 		freed = 1;
 		goto out;
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
 		unsigned long long delta;
-		long total_scan, pages_got;
+		long total_scan;
 		long max_pass;
-		int shrink_ret = 0;
 		long nr;
 		long new_nr;
 		long batch_size = shrinker->batch ? shrinker->batch
 						  : SHRINK_BATCH;
 
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-		if (max_pass <= 0)
+		if (shrinker->count_objects)
+ 			max_pass = shrinker->count_objects(shrinker, shrinkctl);
+ 		else
+ 			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ 		if (max_pass <= 0)
 			continue;
 
 		/*
@@ -317,9 +324,9 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		do_div(delta, lru_pages + 1);
 		total_scan += delta;
 		if (total_scan < 0) {
-			printk(KERN_ERR "shrink_slab: %pF negative objects to "
-			       "delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
+			printk(KERN_ERR
+ 			"shrink_slab: %pF negative objects to delete nr=%ld\n",
+  			       shrinker->shrink, total_scan);
 			total_scan = max_pass;
 		}
 
@@ -346,26 +353,35 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		if (total_scan > max_pass * 2)
 			total_scan = max_pass * 2;
 
-		trace_mm_shrink_slab_start(shrinker, shrink, nr,
+		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
 					nr_pages_scanned, lru_pages,
 					max_pass, delta, total_scan);
 
 		while (total_scan >= batch_size) {
-			int nr_before;
 
-			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							batch_size);
-			if (shrink_ret == -1)
-				break;
-			if (shrink_ret < nr_before) {
-				pages_got = nr_before - shrink_ret;
-				ret += pages_got;
-				total_scan -= pages_got > batch_size ? pages_got : batch_size;
-			} else {
-				total_scan -= batch_size;
-			}
+			if (shrinker->scan_objects) {
+	 			unsigned long ret;
+ 				shrinkctl->nr_to_scan = batch_size;
+ 				ret = shrinker->scan_objects(shrinker, shrinkctl);
+ 
+ 				if (ret == SHRINK_STOP)
+ 					break;
+ 				freed += ret;
+ 			} else {
+ 				int nr_before;
+ 				long ret;
+ 
+ 				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+ 				ret = do_shrinker_shrink(shrinker, shrinkctl,
+ 								batch_size);
+ 				if (ret == -1)
+ 					break;
+ 				if (ret < nr_before)
+ 					freed += nr_before - ret;
+ 			}
+ 
 			count_vm_events(SLABS_SCANNED, batch_size);
+			total_scan -= batch_size;
 
 			cond_resched();
 		}
@@ -381,12 +397,12 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		else
 			new_nr = atomic_long_read(&shrinker->nr_in_batch);
 
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
+		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
 	cond_resched();
-	return ret;
+	return freed;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -974,7 +990,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
-		if (PageSwapCache(page) && vm_swap_full())
+		if (PageSwapCache(page) && vm_swap_full(page_swap_info(page)))
 			try_to_free_swap(page);
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
@@ -1268,6 +1284,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
 	while (!list_empty(page_list)) {
 		struct page *page = lru_to_page(page_list);
 		int lru;
+		int file;
 
 		VM_BUG_ON(PageLRU(page));
 		list_del(&page->lru);
@@ -1280,6 +1297,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
 		SetPageLRU(page);
 		lru = page_lru(page);
 		add_page_to_lru_list(zone, page, lru);
+		
+		file = is_file_lru(lru);
+#ifdef CONFIG_ZCACHE
+		if (IS_ENABLED(CONFIG_ZCACHE))
+			if (file)
+				SetPageWasActive(page);
+#endif
 		if (is_active_lru(lru)) {
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
@@ -1574,6 +1598,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		}
 
 		ClearPageActive(page);	/* we are de-activating */
+		if (IS_ENABLED(CONFIG_ZCACHE))
+			/*
+			 * For zcache to know whether the page is from active
+			 * file list
+			 */
+			SetPageWasActive(page);
 		list_add(&page->lru, &l_inactive);
 	}
 
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000..894108348
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,640 @@
+/*
+ * zbud.c
+ *
+ * Copyright (C) 2013, Seth Jennings, IBM
+ *
+ * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
+ *
+ * zbud is an special purpose allocator for storing compressed pages.  Contrary
+ * to what its name may suggest, zbud is not a buddy allocator, but rather an
+ * allocator that "buddies" two compressed pages together in a single memory
+ * page.
+ *
+ * While this design limits storage density, it has simple and deterministic
+ * reclaim properties that make it preferable to a higher density approach when
+ * reclaim will be used.
+ *
+ * zbud works by storing compressed pages, or "zpages", together in pairs in a
+ * single memory page called a "zbud page".  The first buddy is "left
+ * justified" at the beginning of the zbud page, and the last buddy is "right
+ * justified" at the end of the zbud page.  The benefit is that if either
+ * buddy is freed, the freed buddy space, coalesced with whatever slack space
+ * that existed between the buddies, results in the largest possible free region
+ * within the zbud page.
+ *
+ * zbud also provides an attractive lower bound on density. The ratio of zpages
+ * to zbud pages can not be less than 1.  This ensures that zbud can never "do
+ * harm" by using more pages to store zpages than the uncompressed zpages would
+ * have used on their own.
+ *
+ * zbud pages are divided into "chunks".  The size of the chunks is fixed at
+ * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
+ * into chunks allows organizing unbuddied zbud pages into a manageable number
+ * of unbuddied lists according to the number of free chunks available in the
+ * zbud page.
+ *
+ * The zbud API differs from that of conventional allocators in that the
+ * allocation function, zbud_alloc(), returns an opaque handle to the user,
+ * not a dereferenceable pointer.  The user must map the handle using
+ * zbud_map() in order to get a usable pointer by which to access the
+ * allocation data and unmap the handle with zbud_unmap() when operations
+ * on the allocation data are complete.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zbud.h>
+#include <linux/zpool.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation.  It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in allocated page is occupied by zbud header, NCHUNKS will be calculated to
+ * 63 which shows the max number of free chunks in zbud page, also there will be
+ * 63 freelists per pool.
+ */
+#define NCHUNKS_ORDER	6
+
+#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+/**
+ * struct zbud_pool - stores metadata for each zbud pool
+ * @lock:	protects all pool fields and first|last_chunk fields of any
+ *		zbud page in the pool
+ * @unbuddied:	array of lists tracking zbud pages that only contain one buddy;
+ *		the lists each zbud page is added to depends on the size of
+ *		its free region.
+ * @buddied:	list tracking the zbud pages that contain two buddies;
+ *		these zbud pages are full
+ * @lru:	list tracking the zbud pages in LRU order by most recently
+ *		added buddy.
+ * @pages_nr:	number of zbud pages in the pool.
+ * @ops:	pointer to a structure of user defined operations specified at
+ *		pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular zbud pool.
+ */
+struct zbud_pool {
+	spinlock_t lock;
+	struct list_head unbuddied[NCHUNKS];
+	struct list_head buddied;
+	struct list_head lru;
+	u64 pages_nr;
+	const struct zbud_ops *ops;
+#ifdef CONFIG_ZPOOL
+	struct zpool *zpool;
+	const struct zpool_ops *zpool_ops;
+#endif
+};
+
+/*
+ * struct zbud_header - zbud page metadata occupying the first chunk of each
+ *			zbud page.
+ * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
+ * @lru:	links the zbud page into the lru list in the pool
+ * @first_chunks:	the size of the first buddy in chunks, 0 if free
+ * @last_chunks:	the size of the last buddy in chunks, 0 if free
+ */
+struct zbud_header {
+	struct list_head buddy;
+	struct list_head lru;
+	unsigned int first_chunks;
+	unsigned int last_chunks;
+	bool under_reclaim;
+};
+
+/*****************
+ * zpool
+ ****************/
+
+#ifdef CONFIG_ZPOOL
+
+static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
+{
+	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+		return pool->zpool_ops->evict(pool->zpool, handle);
+	else
+		return -ENOENT;
+}
+
+static const struct zbud_ops zbud_zpool_ops = {
+	.evict =	zbud_zpool_evict
+};
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
+			       const struct zpool_ops *zpool_ops,
+			       struct zpool *zpool)
+{
+	struct zbud_pool *pool;
+
+	pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+	if (pool) {
+		pool->zpool = zpool;
+		pool->zpool_ops = zpool_ops;
+	}
+	return pool;
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+	zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+	zbud_free(pool, handle);
+}
+
+static int zbud_zpool_shrink(void *pool, unsigned int pages,
+			unsigned int *reclaimed)
+{
+	unsigned int total = 0;
+	int ret = -EINVAL;
+
+	while (total < pages) {
+		ret = zbud_reclaim_page(pool, 8);
+		if (ret < 0)
+			break;
+		total++;
+	}
+
+	if (reclaimed)
+		*reclaimed = total;
+
+	return ret;
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+			enum zpool_mapmode mm)
+{
+	return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+	zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+	return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+	.type =		"zbud",
+	.owner =	THIS_MODULE,
+	.create =	zbud_zpool_create,
+	.destroy =	zbud_zpool_destroy,
+	.malloc =	zbud_zpool_malloc,
+	.free =		zbud_zpool_free,
+	.shrink =	zbud_zpool_shrink,
+	.map =		zbud_zpool_map,
+	.unmap =	zbud_zpool_unmap,
+	.total_size =	zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+#endif /* CONFIG_ZPOOL */
+
+/*****************
+ * Helpers
+*****************/
+/* Just to make the code easier to read */
+enum buddy {
+	FIRST,
+	LAST
+};
+
+/* Converts an allocation size in bytes to size in zbud chunks */
+static int size_to_chunks(size_t size)
+{
+	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the zbud header of a newly allocated zbud page */
+static struct zbud_header *init_zbud_page(struct page *page)
+{
+	struct zbud_header *zhdr = page_address(page);
+	zhdr->first_chunks = 0;
+	zhdr->last_chunks = 0;
+	INIT_LIST_HEAD(&zhdr->buddy);
+	INIT_LIST_HEAD(&zhdr->lru);
+	zhdr->under_reclaim = 0;
+	return zhdr;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_zbud_page(struct zbud_header *zhdr)
+{
+	__free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a zbud page
+ * Pool lock should be held as this function accesses first|last_chunks
+ */
+static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
+{
+	unsigned long handle;
+
+	/*
+	 * For now, the encoded handle is actually just the pointer to the data
+	 * but this might not always be the case.  A little information hiding.
+	 * Add CHUNK_SIZE to the handle if it is the first allocation to jump
+	 * over the zbud header in the first chunk.
+	 */
+	handle = (unsigned long)zhdr;
+	if (bud == FIRST)
+		/* skip over zbud header */
+		handle += ZHDR_SIZE_ALIGNED;
+	else /* bud == LAST */
+		handle += PAGE_SIZE - (zhdr->last_chunks  << CHUNK_SHIFT);
+	return handle;
+}
+
+/* Returns the zbud page where a given handle is stored */
+static struct zbud_header *handle_to_zbud_header(unsigned long handle)
+{
+	return (struct zbud_header *)(handle & PAGE_MASK);
+}
+
+/* Returns the number of free chunks in a zbud page */
+static int num_free_chunks(struct zbud_header *zhdr)
+{
+	/*
+	 * Rather than branch for different situations, just use the fact that
+	 * free buddies have a length of zero to simplify everything.
+	 */
+	return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * zbud_create_pool() - create a new zbud pool
+ * @gfp:	gfp flags when allocating the zbud pool structure
+ * @ops:	user-defined operations for the zbud pool
+ *
+ * Return: pointer to the new zbud pool or NULL if the metadata allocation
+ * failed.
+ */
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+{
+	struct zbud_pool *pool;
+	int i;
+
+	pool = kzalloc(sizeof(struct zbud_pool), gfp);
+	if (!pool)
+		return NULL;
+	spin_lock_init(&pool->lock);
+	for_each_unbuddied_list(i, 0)
+		INIT_LIST_HEAD(&pool->unbuddied[i]);
+	INIT_LIST_HEAD(&pool->buddied);
+	INIT_LIST_HEAD(&pool->lru);
+	pool->pages_nr = 0;
+	pool->ops = ops;
+	return pool;
+}
+
+/**
+ * zbud_destroy_pool() - destroys an existing zbud pool
+ * @pool:	the zbud pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+void zbud_destroy_pool(struct zbud_pool *pool)
+{
+	kfree(pool);
+}
+
+/**
+ * zbud_alloc() - allocates a region of a given size
+ * @pool:	zbud pool from which to allocate
+ * @size:	size in bytes of the desired allocation
+ * @gfp:	gfp flags used if the pool needs to grow
+ * @handle:	handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request.  A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as zbud pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	int chunks, i, freechunks;
+	struct zbud_header *zhdr = NULL;
+	enum buddy bud;
+	struct page *page;
+	int found = 0;
+
+	if (!size || (gfp & __GFP_HIGHMEM))
+		return -EINVAL;
+	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+		return -ENOSPC;
+	chunks = size_to_chunks(size);
+	spin_lock(&pool->lock);
+
+	/* First, try to find an unbuddied zbud page. */
+	zhdr = NULL;
+	for_each_unbuddied_list(i, chunks) {
+		if (!list_empty(&pool->unbuddied[i])) {
+			zhdr = list_first_entry(&pool->unbuddied[i],
+					struct zbud_header, buddy);
+			list_del(&zhdr->buddy);
+			if (zhdr->first_chunks == 0)
+				bud = FIRST;
+			else
+				bud = LAST;
+			found = 1;
+			goto found;
+		}
+	}
+
+	/* Couldn't find unbuddied zbud page, create new one */
+	spin_unlock(&pool->lock);
+	page = alloc_page(gfp);
+	if (!page)
+		return -ENOMEM;
+	spin_lock(&pool->lock);
+	pool->pages_nr++;
+	zhdr = init_zbud_page(page);
+	bud = FIRST;
+
+found:
+	if (bud == FIRST)
+		zhdr->first_chunks = chunks;
+	else
+		zhdr->last_chunks = chunks;
+
+	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
+		/* Add to unbuddied list */
+		freechunks = num_free_chunks(zhdr);
+		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+	} else {
+		/* Add to buddied list */
+		list_add(&zhdr->buddy, &pool->buddied);
+	}
+
+	/* Add/move zbud page to beginning of LRU */
+	if (!list_empty(&zhdr->lru))
+		list_del(&zhdr->lru);
+	list_add(&zhdr->lru, &pool->lru);
+
+	*handle = encode_handle(zhdr, bud);
+	if ((gfp & __GFP_ZERO) && found)
+		memset((void *)*handle, 0, size);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/**
+ * zbud_free() - frees the allocation associated with the given handle
+ * @pool:	pool in which the allocation resided
+ * @handle:	handle associated with the allocation returned by zbud_alloc()
+ *
+ * In the case that the zbud page in which the allocation resides is under
+ * reclaim, as indicated by the PG_reclaim flag being set, this function
+ * only sets the first|last_chunks to 0.  The page is actually freed
+ * once both buddies are evicted (see zbud_reclaim_page() below).
+ */
+void zbud_free(struct zbud_pool *pool, unsigned long handle)
+{
+	struct zbud_header *zhdr;
+	int freechunks;
+
+	spin_lock(&pool->lock);
+	zhdr = handle_to_zbud_header(handle);
+
+	/* If first buddy, handle will be page aligned */
+	if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
+		zhdr->last_chunks = 0;
+	else
+		zhdr->first_chunks = 0;
+
+	if (zhdr->under_reclaim) {
+		/* zbud page is under reclaim, reclaim will free */
+		spin_unlock(&pool->lock);
+		return;
+	}
+
+	/* Remove from existing buddy list */
+	list_del(&zhdr->buddy);
+
+	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+		/* zbud page is empty, free */
+		list_del(&zhdr->lru);
+		free_zbud_page(zhdr);
+		pool->pages_nr--;
+	} else {
+		/* Add to unbuddied list */
+		freechunks = num_free_chunks(zhdr);
+		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+	}
+
+	spin_unlock(&pool->lock);
+}
+
+/**
+ * zbud_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool:	pool from which a page will attempt to be evicted
+ * @retires:	number of pages on the LRU list for which eviction will
+ *		be attempted before failing
+ *
+ * zbud reclaim is different from normal system reclaim in that the reclaim is
+ * done from the bottom, up.  This is because only the bottom layer, zbud, has
+ * information on how the allocations are organized within each zbud page. This
+ * has the potential to create interesting locking situations between zbud and
+ * the user, however.
+ *
+ * To avoid these, this is how zbud_reclaim_page() should be called:
+
+ * The user detects a page should be reclaimed and calls zbud_reclaim_page().
+ * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
+ * the user-defined eviction handler with the pool and handle as arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. zbud_reclaim_page() will add the zbud page back to the
+ * appropriate list and try the next zbud page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the setting of the PG_reclaim flag on the underlying page.
+ *
+ * If all buddies in the zbud page are successfully evicted, then the
+ * zbud page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, -EAGAIN if
+ * the retry limit was hit.
+ */
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+{
+	int i, ret, freechunks;
+	struct zbud_header *zhdr;
+	unsigned long first_handle = 0, last_handle = 0;
+
+	spin_lock(&pool->lock);
+	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+			retries == 0) {
+		spin_unlock(&pool->lock);
+		return -EINVAL;
+	}
+	for (i = 0; i < retries; i++) {
+		zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
+		list_del(&zhdr->lru);
+		list_del(&zhdr->buddy);
+		/* Protect zbud page against free */
+		zhdr->under_reclaim = true;
+		/*
+		 * We need encode the handles before unlocking, since we can
+		 * race with free that will set (first|last)_chunks to 0
+		 */
+		first_handle = 0;
+		last_handle = 0;
+		if (zhdr->first_chunks)
+			first_handle = encode_handle(zhdr, FIRST);
+		if (zhdr->last_chunks)
+			last_handle = encode_handle(zhdr, LAST);
+		spin_unlock(&pool->lock);
+
+		/* Issue the eviction callback(s) */
+		if (first_handle) {
+			ret = pool->ops->evict(pool, first_handle);
+			if (ret)
+				goto next;
+		}
+		if (last_handle) {
+			ret = pool->ops->evict(pool, last_handle);
+			if (ret)
+				goto next;
+		}
+next:
+		spin_lock(&pool->lock);
+		zhdr->under_reclaim = false;
+		if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+			/*
+			 * Both buddies are now free, free the zbud page and
+			 * return success.
+			 */
+			free_zbud_page(zhdr);
+			pool->pages_nr--;
+			spin_unlock(&pool->lock);
+			return 0;
+		} else if (zhdr->first_chunks == 0 ||
+				zhdr->last_chunks == 0) {
+			/* add to unbuddied list */
+			freechunks = num_free_chunks(zhdr);
+			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		} else {
+			/* add to buddied list */
+			list_add(&zhdr->buddy, &pool->buddied);
+		}
+
+		/* add to beginning of LRU */
+		list_add(&zhdr->lru, &pool->lru);
+	}
+	spin_unlock(&pool->lock);
+	return -EAGAIN;
+}
+
+/**
+ * zbud_map() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be mapped
+ *
+ * While trivial for zbud, the mapping functions for others allocators
+ * implementing this allocation API could have more complex information encoded
+ * in the handle and could create temporary mappings to make the data
+ * accessible to the user.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+{
+	return (void *)(handle);
+}
+
+/**
+ * zbud_unmap() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be unmapped
+ */
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zbud_get_pool_size() - gets the zbud pool size in pages
+ * @pool:	pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool.  The pool lock need not be
+ * taken to access pages_nr.
+ */
+u64 zbud_get_pool_size(struct zbud_pool *pool)
+{
+	return pool->pages_nr;
+}
+
+static int __init init_zbud(void)
+{
+	/* Make sure the zbud header will fit in one chunk */
+	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
+	pr_info("loaded\n");
+
+#ifdef CONFIG_ZPOOL
+	zpool_register_driver(&zbud_zpool_driver);
+#endif
+
+	return 0;
+}
+
+static void __exit exit_zbud(void)
+{
+#ifdef CONFIG_ZPOOL
+	zpool_unregister_driver(&zbud_zpool_driver);
+#endif
+
+	pr_info("unloaded\n");
+}
+
+module_init(init_zbud);
+module_exit(exit_zbud);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
+MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zcache.c b/mm/zcache.c
new file mode 100644
index 000000000..ec1c7c919
--- /dev/null
+++ b/mm/zcache.c
@@ -0,0 +1,1158 @@
+/*
+ * linux/mm/zcache.c
+ *
+ * A cleancache backend for file pages compression.
+ * Concepts based on original zcache by Dan Magenheimer.
+ * Copyright (C) 2013  Bob Liu <bob.liu@xxxxxxxxxx>
+ *
+ * With zcache, active file pages can be compressed in memory during page
+ * reclaiming. When their data is needed again the I/O reading operation is
+ * avoided. This results in a significant performance gain under memory pressure
+ * for systems with many file pages.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/cleancache.h>
+#include <linux/cpu.h>
+#include <linux/crypto.h>
+#include <linux/page-flags.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/types.h>
+#include <linux/zbud.h>
+
+/*
+ * Enable/disable zcache (enabled by default)
+ */
+static bool zcache_enabled = true;
+module_param_named(enabled, zcache_enabled, bool, 0);
+
+/*
+ * Compressor to be used by zcache
+ */
+#define ZCACHE_COMPRESSOR_DEFAULT "lzo"
+#ifndef CONFIG_CRYPTO_LZ4
+static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
+#else
+static char *zcache_compressor = "lz4";
+#endif
+module_param_named(compressor, zcache_compressor, charp, 0);
+
+/*
+ * The maximum percentage of memory that the compressed pool can occupy.
+ */
+static unsigned int zcache_max_pool_percent = 10;
+module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644);
+
+static unsigned int zcache_clear_percent = 4;
+module_param_named(clear_percent, zcache_clear_percent, uint, 0644);
+/*
+ * zcache statistics
+ */
+static u64 zcache_pool_limit_hit;
+static u64 zcache_dup_entry;
+static u64 zcache_zbud_alloc_fail;
+static u64 zcache_evict_zpages;
+static u64 zcache_evict_filepages;
+static u64 zcache_inactive_pages_refused;
+static u64 zcache_reclaim_fail;
+static u64 zcache_pool_shrink;
+static u64 zcache_pool_shrink_fail;
+static u64 zcache_pool_shrink_pages;
+static u64 zcache_store_failed;
+static atomic_t zcache_stored_pages = ATOMIC_INIT(0);
+static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0);
+
+#define GFP_ZCACHE \
+	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \
+		__GFP_NOMEMALLOC | __GFP_NO_KSWAPD | __GFP_ZERO)
+
+/*
+ * Make sure this is different from radix tree
+ * indirect ptr or exceptional entry.
+ */
+#define ZERO_HANDLE	((void *)~(~0UL >> 1))
+
+/*
+ * Zcache receives pages for compression through the Cleancache API and is able
+ * to evict pages from its own compressed pool on an LRU basis in the case that
+ * the compressed pool is full.
+ *
+ * Zcache makes use of zbud for the managing the compressed memory pool. Each
+ * allocation in zbud is not directly accessible by address.  Rather, a handle
+ * (zaddr) is return by the allocation routine and that handle(zaddr must be
+ * mapped before being accessed. The compressed memory pool grows on demand and
+ * shrinks as compressed pages are freed.
+ *
+ * When a file page is passed from cleancache to zcache, zcache maintains a
+ * mapping of the <filesystem_type, inode_number, page_index> to the zbud
+ * address that references that compressed file page. This mapping is achieved
+ * with a red-black tree per filesystem type, plus a radix tree per red-black
+ * node.
+ *
+ * A zcache pool with pool_id as the index is created when a filesystem mounted
+ * Each zcache pool has a red-black tree, the inode number(rb_index) is the
+ * search key. Each red-black tree node has a radix tree which use
+ * page->index(ra_index) as the index. Each radix tree slot points to the zbud
+ * address combining with some extra information(zcache_ra_handle).
+ */
+#define MAX_ZCACHE_POOLS 32
+/*
+ * One zcache_pool per (cleancache aware) filesystem mount instance
+ */
+struct zcache_pool {
+	struct rb_root rbtree;
+	rwlock_t rb_lock;		/* Protects rbtree */
+	u64 size;
+	struct zbud_pool *pool;         /* Zbud pool used */
+};
+
+/*
+ * Manage all zcache pools
+ */
+struct _zcache {
+	struct zcache_pool *pools[MAX_ZCACHE_POOLS];
+	u32 num_pools;			/* Current no. of zcache pools */
+	spinlock_t pool_lock;		/* Protects pools[] and num_pools */
+};
+struct _zcache zcache;
+
+/*
+ * Redblack tree node, each node has a page index radix-tree.
+ * Indexed by inode nubmer.
+ */
+struct zcache_rbnode {
+	struct rb_node rb_node;
+	int rb_index;
+	struct radix_tree_root ratree; /* Page radix tree per inode rbtree */
+	spinlock_t ra_lock;		/* Protects radix tree */
+	struct kref refcount;
+};
+
+/*
+ * Radix-tree leaf, indexed by page->index
+ */
+struct zcache_ra_handle {
+	int rb_index;			/* Redblack tree index */
+	int ra_index;			/* Radix tree index */
+	int zlen;			/* Compressed page size */
+	struct zcache_pool *zpool;	/* Finding zcache_pool during evict */
+};
+
+u64 zcache_pages(void)
+{
+	int i;
+	u64 count = 0;
+
+	for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
+		count += zcache.pools[i]->size;
+
+	return count;
+}
+
+static struct kmem_cache *zcache_rbnode_cache;
+static int zcache_rbnode_cache_create(void)
+{
+	zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0);
+	return zcache_rbnode_cache == NULL;
+}
+static void zcache_rbnode_cache_destroy(void)
+{
+	kmem_cache_destroy(zcache_rbnode_cache);
+}
+
+static int zcache_shrink(struct shrinker *s, struct shrink_control *sc)
+{
+	unsigned long active_file;
+	unsigned long file;
+	long file_gap;
+	unsigned long freed = 0;
+	unsigned long pool;
+	static bool running;
+	int i = 0;
+	int retries;
+
+	if (running)
+		goto end;
+
+	running = true;
+	active_file = global_page_state(NR_ACTIVE_FILE);
+	file = global_page_state(NR_FILE_PAGES);
+	pool = zcache_pages();
+
+	file_gap = pool - file;
+
+	if ((file_gap >= 0) &&
+		(totalram_pages * zcache_clear_percent / 100 > file)) {
+		file_gap = pool;
+		zcache_pool_shrink++;
+		goto reclaim;
+	}
+
+	/*
+	 * file_gap == 0 means that the number of pages
+	 * stored by zcache is around twice as many as the
+	 * number of active file pages.
+	 */
+	file_gap = pool - active_file;
+	if (file_gap < 0)
+		file_gap = 0;
+	else
+		zcache_pool_shrink++;
+
+reclaim:
+	retries = file_gap;
+	while ((file_gap > 0) && retries) {
+		struct zcache_pool *zpool =
+			zcache.pools[i++ % MAX_ZCACHE_POOLS];
+		if (!zpool || !zpool->size)
+			continue;
+		if (zbud_reclaim_page(zpool->pool, 8)) {
+			zcache_pool_shrink_fail++;
+			retries--;
+			continue;
+		}
+		freed++;
+		file_gap--;
+	}
+
+	zcache_pool_shrink_pages += freed;
+	for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++)
+		zcache.pools[i]->size =
+			zbud_get_pool_size(zcache.pools[i]->pool);
+
+	running = false;
+end:
+	return freed;
+}
+
+static struct shrinker zcache_shrinker = {
+	.shrink = zcache_shrink,
+	.seeks = DEFAULT_SEEKS * 16
+};
+
+/*
+ * Compression functions
+ * (Below functions are copyed from zswap!)
+ */
+static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms;
+
+enum comp_op {
+	ZCACHE_COMPOP_COMPRESS,
+	ZCACHE_COMPOP_DECOMPRESS
+};
+
+static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
+				u8 *dst, unsigned int *dlen)
+{
+	struct crypto_comp *tfm;
+	int ret;
+
+	tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
+	switch (op) {
+	case ZCACHE_COMPOP_COMPRESS:
+		ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
+		break;
+	case ZCACHE_COMPOP_DECOMPRESS:
+		ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	put_cpu();
+	return ret;
+}
+
+static int __init zcache_comp_init(void)
+{
+	if (!crypto_has_comp(zcache_compressor, 0, 0)) {
+		pr_info("%s compressor not available\n", zcache_compressor);
+		/* fall back to default compressor */
+		zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT;
+		if (!crypto_has_comp(zcache_compressor, 0, 0))
+			/* can't even load the default compressor */
+			return -ENODEV;
+	}
+	pr_info("using %s compressor\n", zcache_compressor);
+
+	/* alloc percpu transforms */
+	zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
+	if (!zcache_comp_pcpu_tfms)
+		return -ENOMEM;
+	return 0;
+}
+
+static void zcache_comp_exit(void)
+{
+	/* free percpu transforms */
+	if (zcache_comp_pcpu_tfms)
+		free_percpu(zcache_comp_pcpu_tfms);
+}
+
+/*
+ * Per-cpu code
+ * (Below functions are also copyed from zswap!)
+ */
+static DEFINE_PER_CPU(u8 *, zcache_dstmem);
+
+static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu)
+{
+	struct crypto_comp *tfm;
+	u8 *dst;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		tfm = crypto_alloc_comp(zcache_compressor, 0, 0);
+		if (IS_ERR(tfm)) {
+			pr_err("can't allocate compressor transform\n");
+			return NOTIFY_BAD;
+		}
+		*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
+		dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
+		if (!dst) {
+			pr_err("can't allocate compressor buffer\n");
+			crypto_free_comp(tfm);
+			*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
+			return NOTIFY_BAD;
+		}
+		per_cpu(zcache_dstmem, cpu) = dst;
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
+		if (tfm) {
+			crypto_free_comp(tfm);
+			*per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
+		}
+		dst = per_cpu(zcache_dstmem, cpu);
+		kfree(dst);
+		per_cpu(zcache_dstmem, cpu) = NULL;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int zcache_cpu_notifier(struct notifier_block *nb,
+				unsigned long action, void *pcpu)
+{
+	unsigned long cpu = (unsigned long)pcpu;
+
+	return __zcache_cpu_notifier(action, cpu);
+}
+
+static struct notifier_block zcache_cpu_notifier_block = {
+	.notifier_call = zcache_cpu_notifier
+};
+
+static int zcache_cpu_init(void)
+{
+	unsigned long cpu;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
+			goto cleanup;
+	register_cpu_notifier(&zcache_cpu_notifier_block);
+	put_online_cpus();
+	return 0;
+
+cleanup:
+	for_each_online_cpu(cpu)
+		__zcache_cpu_notifier(CPU_UP_CANCELED, cpu);
+	put_online_cpus();
+	return -ENOMEM;
+}
+
+/*
+ * Zcache helpers
+ */
+static bool zcache_is_full(void)
+{
+	long file = global_page_state(NR_FILE_PAGES);
+
+	return ((totalram_pages * zcache_max_pool_percent / 100 <
+			zcache_pages()) ||
+			(totalram_pages * zcache_clear_percent / 100 >
+			file));
+}
+
+/*
+ * The caller must hold zpool->rb_lock at least
+ */
+static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree,
+	int index, struct rb_node **rb_parent, struct rb_node ***rb_link)
+{
+	struct zcache_rbnode *entry;
+	struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+
+	__rb_link = &rbtree->rb_node;
+	rb_prev = __rb_parent = NULL;
+
+	while (*__rb_link) {
+		__rb_parent = *__rb_link;
+		entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node);
+		if (entry->rb_index > index)
+			__rb_link = &__rb_parent->rb_left;
+		else if (entry->rb_index < index) {
+			rb_prev = __rb_parent;
+			__rb_link = &__rb_parent->rb_right;
+		} else
+			return entry;
+	}
+
+	if (rb_parent)
+		*rb_parent = __rb_parent;
+	if (rb_link)
+		*rb_link = __rb_link;
+	return NULL;
+}
+
+static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool,
+					int rb_index)
+{
+	unsigned long flags;
+	struct zcache_rbnode *rbnode;
+
+	read_lock_irqsave(&zpool->rb_lock, flags);
+	rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, 0, 0);
+	if (rbnode)
+		kref_get(&rbnode->refcount);
+	read_unlock_irqrestore(&zpool->rb_lock, flags);
+	return rbnode;
+}
+
+/*
+ * kref_put callback for zcache_rbnode.
+ *
+ * The rbnode must have been isolated from rbtree already.
+ */
+static void zcache_rbnode_release(struct kref *kref)
+{
+	struct zcache_rbnode *rbnode;
+
+	rbnode = container_of(kref, struct zcache_rbnode, refcount);
+	BUG_ON(rbnode->ratree.rnode);
+	kmem_cache_free(zcache_rbnode_cache, rbnode);
+}
+
+/*
+ * Check whether the radix-tree of this rbnode is empty.
+ * If that's true, then we can delete this zcache_rbnode from
+ * zcache_pool->rbtree
+ *
+ * Caller must hold zcache_rbnode->ra_lock
+ */
+static int zcache_rbnode_empty(struct zcache_rbnode *rbnode)
+{
+	return rbnode->ratree.rnode == NULL;
+}
+
+/*
+ * Remove zcache_rbnode from zpool->rbtree
+ *
+ * holded_rblock - whether the caller has holded zpool->rb_lock
+ */
+static void zcache_rbnode_isolate(struct zcache_pool *zpool,
+		struct zcache_rbnode *rbnode, bool holded_rblock)
+{
+	unsigned long flags;
+
+	if (!holded_rblock)
+		write_lock_irqsave(&zpool->rb_lock, flags);
+	/*
+	 * Someone can get reference on this rbnode before we could
+	 * acquire write lock above.
+	 * We want to remove it from zpool->rbtree when only the caller and
+	 * corresponding ratree holds a reference to this rbnode.
+	 * Below check ensures that a racing zcache put will not end up adding
+	 * a page to an isolated node and thereby losing that memory.
+	 */
+	if (atomic_read(&rbnode->refcount.refcount) == 2) {
+		rb_erase(&rbnode->rb_node, &zpool->rbtree);
+		RB_CLEAR_NODE(&rbnode->rb_node);
+		kref_put(&rbnode->refcount, zcache_rbnode_release);
+	}
+	if (!holded_rblock)
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+}
+
+/*
+ * Store zaddr which allocated by zbud_alloc() to the hierarchy rbtree-ratree.
+ */
+static int zcache_store_zaddr(struct zcache_pool *zpool,
+		int ra_index, int rb_index, unsigned long zaddr)
+{
+	unsigned long flags;
+	struct zcache_rbnode *rbnode, *tmp;
+	struct rb_node **link = NULL, *parent = NULL;
+	int ret;
+	void *dup_zaddr;
+
+	rbnode = zcache_find_get_rbnode(zpool, rb_index);
+	if (!rbnode) {
+		/* alloc and init a new rbnode */
+		rbnode = kmem_cache_alloc(zcache_rbnode_cache,
+			GFP_ZCACHE);
+		if (!rbnode)
+			return -ENOMEM;
+
+		INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN);
+		spin_lock_init(&rbnode->ra_lock);
+		rbnode->rb_index = rb_index;
+		kref_init(&rbnode->refcount);
+		RB_CLEAR_NODE(&rbnode->rb_node);
+
+		/* add that rbnode to rbtree */
+		write_lock_irqsave(&zpool->rb_lock, flags);
+		tmp = zcache_find_rbnode(&zpool->rbtree, rb_index,
+				&parent, &link);
+		if (tmp) {
+			/* somebody else allocated new rbnode */
+			kmem_cache_free(zcache_rbnode_cache, rbnode);
+			rbnode = tmp;
+		} else {
+			rb_link_node(&rbnode->rb_node, parent, link);
+			rb_insert_color(&rbnode->rb_node, &zpool->rbtree);
+		}
+
+		/* Inc the reference of this zcache_rbnode */
+		kref_get(&rbnode->refcount);
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+	}
+
+	/* Succfully got a zcache_rbnode when arriving here */
+	spin_lock_irqsave(&rbnode->ra_lock, flags);
+	dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+	if (unlikely(dup_zaddr)) {
+		if (dup_zaddr == ZERO_HANDLE) {
+			atomic_dec(&zcache_stored_zero_pages);
+		} else {
+			zbud_free(zpool->pool, (unsigned long)dup_zaddr);
+			atomic_dec(&zcache_stored_pages);
+			zpool->size = zbud_get_pool_size(zpool->pool);
+		}
+		zcache_dup_entry++;
+	}
+
+	/* Insert zcache_ra_handle to ratree */
+	ret = radix_tree_insert(&rbnode->ratree, ra_index,
+				(void *)zaddr);
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+	if (unlikely(ret)) {
+		write_lock_irqsave(&zpool->rb_lock, flags);
+		spin_lock(&rbnode->ra_lock);
+
+		if (zcache_rbnode_empty(rbnode))
+			zcache_rbnode_isolate(zpool, rbnode, 1);
+
+		spin_unlock(&rbnode->ra_lock);
+		write_unlock_irqrestore(&zpool->rb_lock, flags);
+	}
+
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+	return ret;
+}
+
+/*
+ * Load zaddr and delete it from radix tree.
+ * If the radix tree of the corresponding rbnode is empty, delete the rbnode
+ * from zpool->rbtree also.
+ */
+static void *zcache_load_delete_zaddr(struct zcache_pool *zpool,
+				int rb_index, int ra_index)
+{
+	struct zcache_rbnode *rbnode;
+	void *zaddr = NULL;
+	unsigned long flags;
+
+	rbnode = zcache_find_get_rbnode(zpool, rb_index);
+	if (!rbnode)
+		goto out;
+
+	BUG_ON(rbnode->rb_index != rb_index);
+
+	spin_lock_irqsave(&rbnode->ra_lock, flags);
+	zaddr = radix_tree_delete(&rbnode->ratree, ra_index);
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags);
+
+	/* rb_lock and ra_lock must be taken again in the given sequence */
+	write_lock_irqsave(&zpool->rb_lock, flags);
+	spin_lock(&rbnode->ra_lock);
+	if (zcache_rbnode_empty(rbnode))
+		zcache_rbnode_isolate(zpool, rbnode, 1);
+	spin_unlock(&rbnode->ra_lock);
+	write_unlock_irqrestore(&zpool->rb_lock, flags);
+
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+out:
+	return zaddr;
+}
+
+static bool zero_page(struct page *page)
+{
+	unsigned long *ptr = kmap_atomic(page);
+	int i;
+	bool ret = false;
+
+	for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) {
+		if (ptr[i])
+			goto out;
+	}
+	ret = true;
+out:
+	kunmap_atomic(ptr);
+	return ret;
+}
+
+static void zcache_store_page(int pool_id, struct cleancache_filekey key,
+		pgoff_t index, struct page *page)
+{
+	struct zcache_ra_handle *zhandle;
+	u8 *zpage, *src, *dst;
+	/* Address of zhandle + compressed data(zpage) */
+	unsigned long zaddr = 0;
+	unsigned int zlen = PAGE_SIZE;
+	bool zero = 0;
+	int ret;
+
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+
+	/*
+	 * Zcache will be ineffective if the compressed memory pool is full with
+	 * compressed inactive file pages and most of them will never be used
+	 * again.
+	 * So we refuse to compress pages that are not from active file list.
+	 */
+	if (!PageWasActive(page)) {
+		zcache_inactive_pages_refused++;
+		return;
+	}
+
+	zero = zero_page(page);
+	if (zero)
+		goto zero;
+
+	if (zcache_is_full()) {
+		zcache_pool_limit_hit++;
+		if (zbud_reclaim_page(zpool->pool, 8)) {
+			zcache_reclaim_fail++;
+			return;
+		}
+		/*
+		 * Continue if reclaimed a page frame succ.
+		 */
+		zcache_evict_filepages++;
+		zpool->size = zbud_get_pool_size(zpool->pool);
+	}
+
+	/* compress */
+	dst = get_cpu_var(zcache_dstmem);
+	src = kmap_atomic(page);
+	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst,
+			&zlen);
+	kunmap_atomic(src);
+	if (ret) {
+		pr_err("zcache compress error ret %d\n", ret);
+		put_cpu_var(zcache_dstmem);
+		return;
+	}
+
+	/* store zcache handle together with compressed page data */
+	ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle),
+			GFP_ZCACHE, &zaddr);
+	if (ret) {
+		zcache_zbud_alloc_fail++;
+		put_cpu_var(zcache_dstmem);
+		return;
+	}
+
+	zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr);
+
+	/* Compressed page data stored at the end of zcache_ra_handle */
+	zpage = (u8 *)(zhandle + 1);
+	memcpy(zpage, dst, zlen);
+	zbud_unmap(zpool->pool, zaddr);
+	put_cpu_var(zcache_dstmem);
+
+zero:
+	if (zero)
+		zaddr = (unsigned long)ZERO_HANDLE;
+
+	/* store zcache handle */
+	ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr);
+	if (ret) {
+		zcache_store_failed++;
+		if (!zero)
+			zbud_free(zpool->pool, zaddr);
+		return;
+	}
+
+	/* update stats */
+	if (zero) {
+		atomic_inc(&zcache_stored_zero_pages);
+	} else {
+		zhandle->ra_index = index;
+		zhandle->rb_index = key.u.ino;
+		zhandle->zlen = zlen;
+		zhandle->zpool = zpool;
+		atomic_inc(&zcache_stored_pages);
+		zpool->size = zbud_get_pool_size(zpool->pool);
+	}
+
+	return;
+}
+
+static int zcache_load_page(int pool_id, struct cleancache_filekey key,
+			pgoff_t index, struct page *page)
+{
+	int ret = 0;
+	u8 *src, *dst;
+	void *zaddr;
+	unsigned int dlen = PAGE_SIZE;
+	struct zcache_ra_handle *zhandle;
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+
+	zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
+	if (!zaddr)
+		return -ENOENT;
+	else if (zaddr == ZERO_HANDLE)
+		goto map;
+
+	zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool,
+			(unsigned long)zaddr);
+	/* Compressed page data stored at the end of zcache_ra_handle */
+	src = (u8 *)(zhandle + 1);
+
+	/* decompress */
+map:
+	dst = kmap_atomic(page);
+	if (zaddr != ZERO_HANDLE) {
+		ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src,
+				zhandle->zlen, dst, &dlen);
+	} else {
+		memset(dst, 0, PAGE_SIZE);
+		kunmap_atomic(dst);
+		flush_dcache_page(page);
+		atomic_dec(&zcache_stored_zero_pages);
+		goto out;
+	}
+	kunmap_atomic(dst);
+	zbud_unmap(zpool->pool, (unsigned long)zaddr);
+	zbud_free(zpool->pool, (unsigned long)zaddr);
+
+	BUG_ON(ret);
+	BUG_ON(dlen != PAGE_SIZE);
+
+	/* update stats */
+	atomic_dec(&zcache_stored_pages);
+	zpool->size = zbud_get_pool_size(zpool->pool);
+out:
+	SetPageWasActive(page);
+	return ret;
+}
+
+static void zcache_flush_page(int pool_id, struct cleancache_filekey key,
+			pgoff_t index)
+{
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+	void *zaddr = NULL;
+
+	zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index);
+	if (zaddr && (zaddr != ZERO_HANDLE)) {
+		zbud_free(zpool->pool, (unsigned long)zaddr);
+		atomic_dec(&zcache_stored_pages);
+		zpool->size = zbud_get_pool_size(zpool->pool);
+	} else if (zaddr == ZERO_HANDLE) {
+		atomic_dec(&zcache_stored_zero_pages);
+	}
+}
+
+#define FREE_BATCH 16
+/*
+ * Callers must hold the lock
+ */
+static void zcache_flush_ratree(struct zcache_pool *zpool,
+		struct zcache_rbnode *rbnode)
+{
+	unsigned long index = 0;
+	int count, i;
+	struct zcache_ra_handle *zhandle;
+	void *zaddr = NULL;
+
+	do {
+		void *zaddrs[FREE_BATCH];
+		unsigned long indices[FREE_BATCH];
+
+		count = radix_tree_gang_lookup_index(&rbnode->ratree,
+				(void **)zaddrs, indices,
+				index, FREE_BATCH);
+
+		for (i = 0; i < count; i++) {
+			if (zaddrs[i] == ZERO_HANDLE) {
+				zaddr = radix_tree_delete(&rbnode->ratree,
+					indices[i]);
+				if (zaddr)
+					atomic_dec(&zcache_stored_zero_pages);
+				continue;
+			}
+			zhandle = (struct zcache_ra_handle *)zbud_map(
+					zpool->pool, (unsigned long)zaddrs[i]);
+			index = zhandle->ra_index;
+			zaddr = radix_tree_delete(&rbnode->ratree, index);
+			if (!zaddr)
+				continue;
+			zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]);
+			zbud_free(zpool->pool, (unsigned long)zaddrs[i]);
+			atomic_dec(&zcache_stored_pages);
+			zpool->size = zbud_get_pool_size(zpool->pool);
+		}
+
+		index++;
+	} while (count == FREE_BATCH);
+}
+
+static void zcache_flush_inode(int pool_id, struct cleancache_filekey key)
+{
+	struct zcache_rbnode *rbnode;
+	unsigned long flags1, flags2;
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+
+	/*
+	 * Refuse new pages added in to the same rbinode, so get rb_lock at
+	 * first.
+	 */
+	write_lock_irqsave(&zpool->rb_lock, flags1);
+	rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, 0, 0);
+	if (!rbnode) {
+		write_unlock_irqrestore(&zpool->rb_lock, flags1);
+		return;
+	}
+
+	kref_get(&rbnode->refcount);
+	spin_lock_irqsave(&rbnode->ra_lock, flags2);
+
+	zcache_flush_ratree(zpool, rbnode);
+	if (zcache_rbnode_empty(rbnode))
+		/* When arrvied here, we already hold rb_lock */
+		zcache_rbnode_isolate(zpool, rbnode, 1);
+
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags2);
+	write_unlock_irqrestore(&zpool->rb_lock, flags1);
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+}
+
+static void zcache_destroy_pool(struct zcache_pool *zpool);
+static void zcache_flush_fs(int pool_id)
+{
+	struct zcache_rbnode *z_rbnode = NULL;
+	struct rb_node *rbnode;
+	unsigned long flags1, flags2;
+	struct zcache_pool *zpool;
+
+	if (pool_id < 0)
+		return;
+
+	zpool = zcache.pools[pool_id];
+	if (!zpool)
+		return;
+
+	/*
+	 * Refuse new pages added in, so get rb_lock at first.
+	 */
+	write_lock_irqsave(&zpool->rb_lock, flags1);
+
+	rbnode = rb_first(&zpool->rbtree);
+	while (rbnode) {
+		z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node);
+		rbnode = rb_next(rbnode);
+		if (z_rbnode) {
+			kref_get(&z_rbnode->refcount);
+			spin_lock_irqsave(&z_rbnode->ra_lock, flags2);
+			zcache_flush_ratree(zpool, z_rbnode);
+			if (zcache_rbnode_empty(z_rbnode))
+				zcache_rbnode_isolate(zpool, z_rbnode, 1);
+			spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2);
+			kref_put(&z_rbnode->refcount, zcache_rbnode_release);
+		}
+	}
+
+	write_unlock_irqrestore(&zpool->rb_lock, flags1);
+	zcache_destroy_pool(zpool);
+}
+
+/*
+ * Evict compressed pages from zcache pool on an LRU basis after the compressed
+ * pool is full.
+ */
+static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr)
+{
+	struct zcache_pool *zpool;
+	struct zcache_ra_handle *zhandle;
+	void *zaddr_intree;
+
+	BUG_ON(zaddr == (unsigned long)ZERO_HANDLE);
+
+	zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr);
+
+	zpool = zhandle->zpool;
+	/* There can be a race with zcache store */
+	if (!zpool)
+		return -EINVAL;
+
+	BUG_ON(pool != zpool->pool);
+
+	zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index,
+			zhandle->ra_index);
+	if (zaddr_intree) {
+		BUG_ON((unsigned long)zaddr_intree != zaddr);
+		zbud_unmap(pool, zaddr);
+		zbud_free(pool, zaddr);
+		atomic_dec(&zcache_stored_pages);
+		zpool->size = zbud_get_pool_size(pool);
+		zcache_evict_zpages++;
+	}
+	return 0;
+}
+
+static struct zbud_ops zcache_zbud_ops = {
+	.evict = zcache_evict_zpage
+};
+
+/* Return pool id */
+static int zcache_create_pool(void)
+{
+	int ret;
+	struct zcache_pool *zpool;
+
+	zpool = kzalloc(sizeof(*zpool), GFP_KERNEL);
+	if (!zpool) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops);
+	if (!zpool->pool) {
+		kfree(zpool);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock(&zcache.pool_lock);
+	if (zcache.num_pools == MAX_ZCACHE_POOLS) {
+		pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS);
+		zbud_destroy_pool(zpool->pool);
+		kfree(zpool);
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	rwlock_init(&zpool->rb_lock);
+	zpool->rbtree = RB_ROOT;
+	/* Add to pool list */
+	for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++)
+		if (!zcache.pools[ret])
+			break;
+	zcache.pools[ret] = zpool;
+	zcache.num_pools++;
+	pr_info("New pool created id:%d\n", ret);
+
+out_unlock:
+	spin_unlock(&zcache.pool_lock);
+out:
+	return ret;
+}
+
+static void zcache_destroy_pool(struct zcache_pool *zpool)
+{
+	int i;
+
+	if (!zpool)
+		return;
+
+	spin_lock(&zcache.pool_lock);
+	zcache.num_pools--;
+	for (i = 0; i < MAX_ZCACHE_POOLS; i++)
+		if (zcache.pools[i] == zpool)
+			break;
+	zcache.pools[i] = NULL;
+	spin_unlock(&zcache.pool_lock);
+
+	if (!RB_EMPTY_ROOT(&zpool->rbtree))
+		WARN_ON("Memory leak detected. Freeing non-empty pool!\n");
+
+	zbud_destroy_pool(zpool->pool);
+	kfree(zpool);
+}
+
+static int zcache_init_fs(size_t pagesize)
+{
+	int ret;
+
+	if (pagesize != PAGE_SIZE) {
+		pr_info("Unsupported page size: %zu", pagesize);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = zcache_create_pool();
+	if (ret < 0) {
+		pr_info("Failed to create new pool\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+out:
+	return ret;
+}
+
+static int zcache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	/* shared pools are unsupported and map to private */
+	return zcache_init_fs(pagesize);
+}
+
+static struct cleancache_ops zcache_ops = {
+	.put_page = zcache_store_page,
+	.get_page = zcache_load_page,
+	.invalidate_page = zcache_flush_page,
+	.invalidate_inode = zcache_flush_inode,
+	.invalidate_fs = zcache_flush_fs,
+	.init_shared_fs = zcache_init_shared_fs,
+	.init_fs = zcache_init_fs
+};
+
+/*
+ * Debugfs functions
+ */
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+
+static int pool_pages_get(void *_data, u64 *val)
+{
+	*val = zcache_pages();
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n");
+
+static struct dentry *zcache_debugfs_root;
+
+static int __init zcache_debugfs_init(void)
+{
+	if (!debugfs_initialized())
+		return -ENODEV;
+
+	zcache_debugfs_root = debugfs_create_dir("zcache", NULL);
+	if (!zcache_debugfs_root)
+		return -ENOMEM;
+
+	debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root,
+			&zcache_pool_limit_hit);
+	debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root,
+			&zcache_zbud_alloc_fail);
+	debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root,
+			&zcache_dup_entry);
+	debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL,
+			&pool_page_fops);
+	debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root,
+			&zcache_stored_pages);
+	debugfs_create_atomic_t("stored_zero_pages", S_IRUGO,
+			zcache_debugfs_root, &zcache_stored_zero_pages);
+	debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root,
+			&zcache_evict_zpages);
+	debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root,
+			&zcache_evict_filepages);
+	debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root,
+			&zcache_reclaim_fail);
+	debugfs_create_u64("inactive_pages_refused", S_IRUGO,
+			zcache_debugfs_root, &zcache_inactive_pages_refused);
+	debugfs_create_u64("pool_shrink_count", S_IRUGO,
+			zcache_debugfs_root, &zcache_pool_shrink);
+	debugfs_create_u64("pool_shrink_fail", S_IRUGO,
+			zcache_debugfs_root, &zcache_pool_shrink_fail);
+	debugfs_create_u64("pool_shrink_pages", S_IRUGO,
+			zcache_debugfs_root, &zcache_pool_shrink_pages);
+	debugfs_create_u64("store_fail", S_IRUGO,
+			zcache_debugfs_root, &zcache_store_failed);
+	return 0;
+}
+
+static void __exit zcache_debugfs_exit(void)
+{
+	debugfs_remove_recursive(zcache_debugfs_root);
+}
+#else
+static int __init zcache_debugfs_init(void)
+{
+	return 0;
+}
+static void __exit zcache_debugfs_exit(void)
+{
+}
+#endif
+
+/*
+ * zcache init and exit
+ */
+static int __init init_zcache(void)
+{
+	if (!zcache_enabled)
+		return 0;
+
+	pr_info("loading zcache..\n");
+	if (zcache_rbnode_cache_create()) {
+		pr_err("entry cache creation failed\n");
+		goto error;
+	}
+
+	if (zcache_comp_init()) {
+		pr_err("compressor initialization failed\n");
+		goto compfail;
+	}
+	if (zcache_cpu_init()) {
+		pr_err("per-cpu initialization failed\n");
+		goto pcpufail;
+	}
+
+	spin_lock_init(&zcache.pool_lock);
+	cleancache_register_ops(&zcache_ops);
+
+	if (zcache_debugfs_init())
+		pr_warn("debugfs initialization failed\n");
+	register_shrinker(&zcache_shrinker);
+	return 0;
+pcpufail:
+	zcache_comp_exit();
+compfail:
+	zcache_rbnode_cache_destroy();
+error:
+	return -ENOMEM;
+}
+
+/* must be late so crypto has time to come up */
+late_initcall(init_zcache);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bob Liu <bob.liu@xxxxxxxxxx>");
+MODULE_DESCRIPTION("Compressed cache for clean file pages");
+
diff --git a/mm/zpool.c b/mm/zpool.c
new file mode 100644
index 000000000..308395ba8
--- /dev/null
+++ b/mm/zpool.c
@@ -0,0 +1,359 @@
+/*
+ * zpool memory storage api
+ *
+ * Copyright (C) 2014 Dan Streetman
+ *
+ * This is a common frontend for memory storage pool implementations.
+ * Typically, this is used to store compressed memory.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/zpool.h>
+
+struct zpool {
+	char *type;
+
+	struct zpool_driver *driver;
+	void *pool;
+	const struct zpool_ops *ops;
+
+	struct list_head list;
+};
+
+static LIST_HEAD(drivers_head);
+static DEFINE_SPINLOCK(drivers_lock);
+
+static LIST_HEAD(pools_head);
+static DEFINE_SPINLOCK(pools_lock);
+
+/**
+ * zpool_register_driver() - register a zpool implementation.
+ * @driver:	driver to register
+ */
+void zpool_register_driver(struct zpool_driver *driver)
+{
+	spin_lock(&drivers_lock);
+	atomic_set(&driver->refcount, 0);
+	list_add(&driver->list, &drivers_head);
+	spin_unlock(&drivers_lock);
+}
+EXPORT_SYMBOL(zpool_register_driver);
+
+/**
+ * zpool_unregister_driver() - unregister a zpool implementation.
+ * @driver:	driver to unregister.
+ *
+ * Module usage counting is used to prevent using a driver
+ * while/after unloading, so if this is called from module
+ * exit function, this should never fail; if called from
+ * other than the module exit function, and this returns
+ * failure, the driver is in use and must remain available.
+ */
+int zpool_unregister_driver(struct zpool_driver *driver)
+{
+	int ret = 0, refcount;
+
+	spin_lock(&drivers_lock);
+	refcount = atomic_read(&driver->refcount);
+	WARN_ON(refcount < 0);
+	if (refcount > 0)
+		ret = -EBUSY;
+	else
+		list_del(&driver->list);
+	spin_unlock(&drivers_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(zpool_unregister_driver);
+
+/* this assumes @type is null-terminated. */
+static struct zpool_driver *zpool_get_driver(const char *type)
+{
+	struct zpool_driver *driver;
+
+	spin_lock(&drivers_lock);
+	list_for_each_entry(driver, &drivers_head, list) {
+		if (!strcmp(driver->type, type)) {
+			bool got = try_module_get(driver->owner);
+
+			if (got)
+				atomic_inc(&driver->refcount);
+			spin_unlock(&drivers_lock);
+			return got ? driver : NULL;
+		}
+	}
+
+	spin_unlock(&drivers_lock);
+	return NULL;
+}
+
+static void zpool_put_driver(struct zpool_driver *driver)
+{
+	atomic_dec(&driver->refcount);
+	module_put(driver->owner);
+}
+
+/**
+ * zpool_has_pool() - Check if the pool driver is available
+ * @type	The type of the zpool to check (e.g. zbud, zsmalloc)
+ *
+ * This checks if the @type pool driver is available.  This will try to load
+ * the requested module, if needed, but there is no guarantee the module will
+ * still be loaded and available immediately after calling.  If this returns
+ * true, the caller should assume the pool is available, but must be prepared
+ * to handle the @zpool_create_pool() returning failure.  However if this
+ * returns false, the caller should assume the requested pool type is not
+ * available; either the requested pool type module does not exist, or could
+ * not be loaded, and calling @zpool_create_pool() with the pool type will
+ * fail.
+ *
+ * Returns: true if @type pool is available, false if not
+ */
+bool zpool_has_pool(char *type)
+{
+	struct zpool_driver *driver = zpool_get_driver(type);
+
+	if (!driver) {
+		request_module("zpool-%s", type);
+		driver = zpool_get_driver(type);
+	}
+
+	if (!driver)
+		return false;
+
+	zpool_put_driver(driver);
+	return true;
+}
+EXPORT_SYMBOL(zpool_has_pool);
+
+/**
+ * zpool_create_pool() - Create a new zpool
+ * @type	The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name	The name of the zpool (e.g. zram0, zswap)
+ * @gfp		The GFP flags to use when allocating the pool.
+ * @ops		The optional ops callback.
+ *
+ * This creates a new zpool of the specified type.  The gfp flags will be
+ * used when allocating memory, if the implementation supports it.  If the
+ * ops param is NULL, then the created zpool will not be shrinkable.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: New zpool on success, NULL on failure.
+ */
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
+		const struct zpool_ops *ops)
+{
+	struct zpool_driver *driver;
+	struct zpool *zpool;
+
+	pr_debug("creating pool type %s\n", type);
+
+	driver = zpool_get_driver(type);
+
+	if (!driver) {
+		request_module("zpool-%s", type);
+		driver = zpool_get_driver(type);
+	}
+
+	if (!driver) {
+		pr_err("no driver for type %s\n", type);
+		return NULL;
+	}
+
+	zpool = kmalloc(sizeof(*zpool), gfp);
+	if (!zpool) {
+		pr_err("couldn't create zpool - out of memory\n");
+		zpool_put_driver(driver);
+		return NULL;
+	}
+
+	zpool->type = driver->type;
+	zpool->driver = driver;
+	zpool->pool = driver->create(name, gfp, ops, zpool);
+	zpool->ops = ops;
+
+	if (!zpool->pool) {
+		pr_err("couldn't create %s pool\n", type);
+		zpool_put_driver(driver);
+		kfree(zpool);
+		return NULL;
+	}
+
+	pr_debug("created pool type %s\n", type);
+
+	spin_lock(&pools_lock);
+	list_add(&zpool->list, &pools_head);
+	spin_unlock(&pools_lock);
+
+	return zpool;
+}
+
+/**
+ * zpool_destroy_pool() - Destroy a zpool
+ * @pool	The zpool to destroy.
+ *
+ * Implementations must guarantee this to be thread-safe,
+ * however only when destroying different pools.  The same
+ * pool should only be destroyed once, and should not be used
+ * after it is destroyed.
+ *
+ * This destroys an existing zpool.  The zpool should not be in use.
+ */
+void zpool_destroy_pool(struct zpool *zpool)
+{
+	pr_debug("destroying pool type %s\n", zpool->type);
+
+	spin_lock(&pools_lock);
+	list_del(&zpool->list);
+	spin_unlock(&pools_lock);
+	zpool->driver->destroy(zpool->pool);
+	zpool_put_driver(zpool->driver);
+	kfree(zpool);
+}
+
+/**
+ * zpool_get_type() - Get the type of the zpool
+ * @pool	The zpool to check
+ *
+ * This returns the type of the pool.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: The type of zpool.
+ */
+char *zpool_get_type(struct zpool *zpool)
+{
+	return zpool->type;
+}
+
+/**
+ * zpool_malloc() - Allocate memory
+ * @pool	The zpool to allocate from.
+ * @size	The amount of memory to allocate.
+ * @gfp		The GFP flags to use when allocating memory.
+ * @handle	Pointer to the handle to set
+ *
+ * This allocates the requested amount of memory from the pool.
+ * The gfp flags will be used when allocating memory, if the
+ * implementation supports it.  The provided @handle will be
+ * set to the allocated object handle.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: 0 on success, negative value on error.
+ */
+int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	return zpool->driver->malloc(zpool->pool, size, gfp, handle);
+}
+
+/**
+ * zpool_free() - Free previously allocated memory
+ * @pool	The zpool that allocated the memory.
+ * @handle	The handle to the memory to free.
+ *
+ * This frees previously allocated memory.  This does not guarantee
+ * that the pool will actually free memory, only that the memory
+ * in the pool will become available for use by the pool.
+ *
+ * Implementations must guarantee this to be thread-safe,
+ * however only when freeing different handles.  The same
+ * handle should only be freed once, and should not be used
+ * after freeing.
+ */
+void zpool_free(struct zpool *zpool, unsigned long handle)
+{
+	zpool->driver->free(zpool->pool, handle);
+}
+
+/**
+ * zpool_shrink() - Shrink the pool size
+ * @pool	The zpool to shrink.
+ * @pages	The number of pages to shrink the pool.
+ * @reclaimed	The number of pages successfully evicted.
+ *
+ * This attempts to shrink the actual memory size of the pool
+ * by evicting currently used handle(s).  If the pool was
+ * created with no zpool_ops, or the evict call fails for any
+ * of the handles, this will fail.  If non-NULL, the @reclaimed
+ * parameter will be set to the number of pages reclaimed,
+ * which may be more than the number of pages requested.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: 0 on success, negative value on error/failure.
+ */
+int zpool_shrink(struct zpool *zpool, unsigned int pages,
+			unsigned int *reclaimed)
+{
+	return zpool->driver->shrink(zpool->pool, pages, reclaimed);
+}
+
+/**
+ * zpool_map_handle() - Map a previously allocated handle into memory
+ * @pool	The zpool that the handle was allocated from
+ * @handle	The handle to map
+ * @mm		How the memory should be mapped
+ *
+ * This maps a previously allocated handle into memory.  The @mm
+ * param indicates to the implementation how the memory will be
+ * used, i.e. read-only, write-only, read-write.  If the
+ * implementation does not support it, the memory will be treated
+ * as read-write.
+ *
+ * This may hold locks, disable interrupts, and/or preemption,
+ * and the zpool_unmap_handle() must be called to undo those
+ * actions.  The code that uses the mapped handle should complete
+ * its operatons on the mapped handle memory quickly and unmap
+ * as soon as possible.  As the implementation may use per-cpu
+ * data, multiple handles should not be mapped concurrently on
+ * any cpu.
+ *
+ * Returns: A pointer to the handle's mapped memory area.
+ */
+void *zpool_map_handle(struct zpool *zpool, unsigned long handle,
+			enum zpool_mapmode mapmode)
+{
+	return zpool->driver->map(zpool->pool, handle, mapmode);
+}
+
+/**
+ * zpool_unmap_handle() - Unmap a previously mapped handle
+ * @pool	The zpool that the handle was allocated from
+ * @handle	The handle to unmap
+ *
+ * This unmaps a previously mapped handle.  Any locks or other
+ * actions that the implementation took in zpool_map_handle()
+ * will be undone here.  The memory area returned from
+ * zpool_map_handle() should no longer be used after this.
+ */
+void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
+{
+	zpool->driver->unmap(zpool->pool, handle);
+}
+
+/**
+ * zpool_get_total_size() - The total size of the pool
+ * @pool	The zpool to check
+ *
+ * This returns the total size in bytes of the pool.
+ *
+ * Returns: Total size of the zpool in bytes.
+ */
+u64 zpool_get_total_size(struct zpool *zpool)
+{
+	return zpool->driver->total_size(zpool->pool);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
+MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index eff0ca018..7543e3fca 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2,6 +2,7 @@
  * zsmalloc memory allocator
  *
  * Copyright (C) 2011  Nitin Gupta
+ * Copyright (C) 2012, 2013 Minchan Kim
  *
  * This code is released using a dual license strategy: BSD/GPL
  * You can choose the license that better fits your requirements.
@@ -10,43 +11,12 @@
  * Released under the terms of GNU General Public License Version 2.0
  */
 
-
 /*
- * This allocator is designed for use with zcache and zram. Thus, the
- * allocator is supposed to work well under low memory conditions. In
- * particular, it never attempts higher order page allocation which is
- * very likely to fail under memory pressure. On the other hand, if we
- * just use single (0-order) pages, it would suffer from very high
- * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy
- * an entire page. This was one of the major issues with its predecessor
- * (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
  * Following is how we use various fields and flags of underlying
  * struct page(s) to form a zspage.
  *
  * Usage of struct page fields:
- *	page->first_page: points to the first component (0-order) page
+ *	page->private: points to the first component (0-order) page
  *	page->index (union with page->freelist): offset of the first object
  *		starting in this page. For the first page, this is
  *		always 0, so we use this field (aka freelist) to point
@@ -56,14 +26,18 @@
  *
  *	For _first_ page only:
  *
- *	page->private (union with page->first_page): refers to the
- *		component page after the first page
+ *	page->private: refers to the component page after the first page
+ *		If the page is first_page for huge object, it stores handle.
+ *		Look at size_class->huge.
  *	page->freelist: points to the first free object in zspage.
  *		Free objects are linked together using in-place
  *		metadata.
+ *	page->objects: maximum number of objects we can store in this
+ *		zspage (class->zspage_order * PAGE_SIZE / class->size)
  *	page->lru: links together first pages of various zspages.
  *		Basically forming list of zspages in a fullness group.
  *	page->mapping: class index and fullness group of the zspage
+ *	page->inuse: the number of objects that are used in this zspage
  *
  * Usage of struct page flags:
  *	PG_private: identifies the first component page
@@ -71,12 +45,14 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/highmem.h>
-#include <linux/init.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <asm/tlbflush.h>
@@ -84,11 +60,12 @@
 #include <linux/cpumask.h>
 #include <linux/cpu.h>
 #include <linux/vmalloc.h>
-#include <linux/hardirq.h>
+#include <linux/preempt.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-
+#include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
+#include <linux/zpool.h>
 
 /*
  * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -105,6 +82,8 @@
 #define ZS_MAX_ZSPAGE_ORDER 2
 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
 
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
 /*
  * Object location (<PFN>, <obj_idx>) is encoded as
  * as single (unsigned long) handle value.
@@ -128,17 +107,37 @@
 #endif
 #endif
 #define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT	0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
 
 #define MAX(a, b) ((a) >= (b) ? (a) : (b))
 /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
 #define ZS_MIN_ALLOC_SIZE \
 	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
 #define ZS_MAX_ALLOC_SIZE	PAGE_SIZE
 
 /*
- * On systems with 4K page size, this gives 254 size classes! There is a
+ * On systems with 4K page size, this gives 255 size classes! There is a
  * trader-off here:
  *  - Large number of size classes is potentially wasteful as free page are
  *    spread across these classes
@@ -151,8 +150,6 @@
  *  (reason above)
  */
 #define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
-#define ZS_SIZE_CLASSES		((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \
-					ZS_SIZE_CLASS_DELTA + 1)
 
 /*
  * We do not maintain any list for completely empty or full pages
@@ -166,12 +163,38 @@ enum fullness_group {
 	ZS_FULL
 };
 
+enum zs_stat_type {
+	OBJ_ALLOCATED,
+	OBJ_USED,
+	CLASS_ALMOST_FULL,
+	CLASS_ALMOST_EMPTY,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+#define NR_ZS_STAT_TYPE	(CLASS_ALMOST_EMPTY + 1)
+#else
+#define NR_ZS_STAT_TYPE	(OBJ_USED + 1)
+#endif
+
+struct zs_size_stat {
+	unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
+#endif
+
+/*
+ * number of size_classes
+ */
+static int zs_size_classes;
+
 /*
  * We assign a page to ZS_ALMOST_EMPTY fullness group when:
  *	n <= N / f, where
  * n = number of allocated objects
  * N = total number of objects zspage can store
- * f = 1/fullness_threshold_frac
+ * f = fullness_threshold_frac
  *
  * Similarly, we assign zspage to:
  *	ZS_ALMOST_FULL	when n > N / f
@@ -183,6 +206,8 @@ enum fullness_group {
 static const int fullness_threshold_frac = 4;
 
 struct size_class {
+	spinlock_t lock;
+	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
 	/*
 	 * Size of objects stored in this class. Must be multiple
 	 * of ZS_ALIGN.
@@ -190,15 +215,12 @@ struct size_class {
 	int size;
 	unsigned int index;
 
+	struct zs_size_stat stats;
+
 	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
 	int pages_per_zspage;
-
-	spinlock_t lock;
-
-	/* stats */
-	u64 pages_allocated;
-
-	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+	bool huge;
 };
 
 /*
@@ -208,14 +230,39 @@ struct size_class {
  * This must be power of 2 and less than or equal to ZS_ALIGN
  */
 struct link_free {
-	/* Handle of next free chunk (encodes <PFN, obj_idx>) */
-	void *next;
+	union {
+		/*
+		 * Position of next free chunk (encodes <PFN, obj_idx>)
+		 * It's valid for non-allocated object
+		 */
+		void *next;
+		/*
+		 * Handle of allocated object.
+		 */
+		unsigned long handle;
+	};
 };
 
 struct zs_pool {
-	struct size_class size_class[ZS_SIZE_CLASSES];
+	const char *name;
+
+	struct size_class **size_class;
+	struct kmem_cache *handle_cachep;
+
+	atomic_long_t pages_allocated;
 
-	struct zs_ops *ops;
+	struct zs_pool_stats stats;
+
+	/* Compact classes */
+	struct shrinker shrinker;
+	/*
+	 * To signify that register_shrinker() was successful
+	 * and unregister_shrinker() will not Oops.
+	 */
+	bool shrinker_enabled;
+#ifdef CONFIG_ZSMALLOC_STAT
+	struct dentry *stat_dentry;
+#endif
 };
 
 /*
@@ -237,22 +284,128 @@ struct mapping_area {
 	enum zs_mapmode vm_mm; /* mapping mode */
 };
 
-/* default page alloc/free ops */
-struct page *zs_alloc_page(gfp_t flags)
+static int create_handle_cache(struct zs_pool *pool)
+{
+	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+					0, 0, NULL);
+	return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+	kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
+{
+	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+			gfp & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+	kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+	/*
+	 * lsb of @obj represents handle lock while other bits
+	 * represent object value the handle is pointing so
+	 * updating shouldn't do store tearing.
+	 */
+	WRITE_ONCE(*(unsigned long *)handle, obj);
+}
+
+/* zpool driver */
+
+#ifdef CONFIG_ZPOOL
+
+static void *zs_zpool_create(const char *name, gfp_t gfp,
+			     const struct zpool_ops *zpool_ops,
+			     struct zpool *zpool)
+{
+	/*
+	 * Ignore global gfp flags: zs_malloc() may be invoked from
+	 * different contexts and its caller must provide a valid
+	 * gfp mask.
+	 */
+	return zs_create_pool(name);
+}
+
+static void zs_zpool_destroy(void *pool)
+{
+	zs_destroy_pool(pool);
+}
+
+static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	*handle = zs_malloc(pool, size, gfp);
+	return *handle ? 0 : -1;
+}
+static void zs_zpool_free(void *pool, unsigned long handle)
+{
+	zs_free(pool, handle);
+}
+
+static int zs_zpool_shrink(void *pool, unsigned int pages,
+			unsigned int *reclaimed)
+{
+	return -EINVAL;
+}
+
+static void *zs_zpool_map(void *pool, unsigned long handle,
+			enum zpool_mapmode mm)
+{
+	enum zs_mapmode zs_mm;
+
+	switch (mm) {
+	case ZPOOL_MM_RO:
+		zs_mm = ZS_MM_RO;
+		break;
+	case ZPOOL_MM_WO:
+		zs_mm = ZS_MM_WO;
+		break;
+	case ZPOOL_MM_RW: /* fallthru */
+	default:
+		zs_mm = ZS_MM_RW;
+		break;
+	}
+
+	return zs_map_object(pool, handle, zs_mm);
+}
+static void zs_zpool_unmap(void *pool, unsigned long handle)
 {
-	return alloc_page(flags);
+	zs_unmap_object(pool, handle);
 }
 
-void zs_free_page(struct page *page)
+static u64 zs_zpool_total_size(void *pool)
 {
-	__free_page(page);
+	return zs_get_total_pages(pool) << PAGE_SHIFT;
 }
 
-struct zs_ops zs_default_ops = {
-	.alloc = zs_alloc_page,
-	.free = zs_free_page
+static struct zpool_driver zs_zpool_driver = {
+	.type =		"zsmalloc",
+	.owner =	THIS_MODULE,
+	.create =	zs_zpool_create,
+	.destroy =	zs_zpool_destroy,
+	.malloc =	zs_zpool_malloc,
+	.free =		zs_zpool_free,
+	.shrink =	zs_zpool_shrink,
+	.map =		zs_zpool_map,
+	.unmap =	zs_zpool_unmap,
+	.total_size =	zs_zpool_total_size,
 };
 
+MODULE_ALIAS("zpool-zsmalloc");
+#endif /* CONFIG_ZPOOL */
+
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+	return pages_per_zspage * PAGE_SIZE / size;
+}
+
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
 
@@ -266,26 +419,28 @@ static int is_last_page(struct page *page)
 	return PagePrivate2(page);
 }
 
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+static void get_zspage_mapping(struct page *first_page,
+				unsigned int *class_idx,
 				enum fullness_group *fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	m = (unsigned long)page->mapping;
+	m = (unsigned long)first_page->mapping;
 	*fullness = m & FULLNESS_MASK;
 	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
 }
 
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+static void set_zspage_mapping(struct page *first_page,
+				unsigned int class_idx,
 				enum fullness_group fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
 			(fullness & FULLNESS_MASK);
-	page->mapping = (struct address_space *)m;
+	first_page->mapping = (struct address_space *)m;
 }
 
 /*
@@ -303,8 +458,171 @@ static int get_size_class_index(int size)
 		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
 				ZS_SIZE_CLASS_DELTA);
 
-	return idx;
+	return min(zs_size_classes - 1, idx);
+}
+
+static inline void zs_stat_inc(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+	if (type < NR_ZS_STAT_TYPE)
+		class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+				enum zs_stat_type type, unsigned long cnt)
+{
+	if (type < NR_ZS_STAT_TYPE)
+		class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+				enum zs_stat_type type)
+{
+	if (type < NR_ZS_STAT_TYPE)
+		return class->stats.objs[type];
+	return 0;
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static void __init zs_stat_init(void)
+{
+	if (!debugfs_initialized()) {
+		pr_warn("debugfs not available, stat dir not created\n");
+		return;
+	}
+
+	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+	if (!zs_stat_root)
+		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
+}
+
+static void __exit zs_stat_exit(void)
+{
+	debugfs_remove_recursive(zs_stat_root);
+}
+
+static unsigned long zs_can_compact(struct size_class *class);
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+	int i;
+	struct zs_pool *pool = s->private;
+	struct size_class *class;
+	int objs_per_zspage;
+	unsigned long class_almost_full, class_almost_empty;
+	unsigned long obj_allocated, obj_used, pages_used, freeable;
+	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+	unsigned long total_freeable = 0;
+
+	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
+			"class", "size", "almost_full", "almost_empty",
+			"obj_allocated", "obj_used", "pages_used",
+			"pages_per_zspage", "freeable");
+
+	for (i = 0; i < zs_size_classes; i++) {
+		class = pool->size_class[i];
+
+		if (class->index != i)
+			continue;
+
+		spin_lock(&class->lock);
+		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+		obj_used = zs_stat_get(class, OBJ_USED);
+		freeable = zs_can_compact(class);
+		spin_unlock(&class->lock);
+
+		objs_per_zspage = get_maxobj_per_zspage(class->size,
+				class->pages_per_zspage);
+		pages_used = obj_allocated / objs_per_zspage *
+				class->pages_per_zspage;
+
+		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+				" %10lu %10lu %16d %8lu\n",
+			i, class->size, class_almost_full, class_almost_empty,
+			obj_allocated, obj_used, pages_used,
+			class->pages_per_zspage, freeable);
+
+		total_class_almost_full += class_almost_full;
+		total_class_almost_empty += class_almost_empty;
+		total_objs += obj_allocated;
+		total_used_objs += obj_used;
+		total_pages += pages_used;
+		total_freeable += freeable;
+	}
+
+	seq_puts(s, "\n");
+	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
+			"Total", "", total_class_almost_full,
+			total_class_almost_empty, total_objs,
+			total_used_objs, total_pages, "", total_freeable);
+
+	return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+	.open           = zs_stats_size_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = single_release,
+};
+
+static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
+{
+	struct dentry *entry;
+
+	if (!zs_stat_root) {
+		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
+		return;
+	}
+
+	entry = debugfs_create_dir(name, zs_stat_root);
+	if (!entry) {
+		pr_warn("debugfs dir <%s> creation failed\n", name);
+		return;
+	}
+	pool->stat_dentry = entry;
+
+	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+			pool->stat_dentry, pool, &zs_stat_size_ops);
+	if (!entry) {
+		pr_warn("%s: debugfs file entry <%s> creation failed\n",
+				name, "classes");
+		debugfs_remove_recursive(pool->stat_dentry);
+		pool->stat_dentry = NULL;
+	}
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+	debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+static void __init zs_stat_init(void)
+{
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
+{
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
 }
+#endif
 
 /*
  * For each size class, zspages are divided into different groups
@@ -313,21 +631,20 @@ static int get_size_class_index(int size)
  * the pool (not yet implemented). This function returns fullness
  * status of the given page.
  */
-static enum fullness_group get_fullness_group(struct page *page,
-					struct size_class *class)
+static enum fullness_group get_fullness_group(struct page *first_page)
 {
 	int inuse, max_objects;
 	enum fullness_group fg;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	inuse = page->inuse;
-	max_objects = class->pages_per_zspage * PAGE_SIZE / class->size;
+	inuse = first_page->inuse;
+	max_objects = first_page->objects;
 
 	if (inuse == 0)
 		fg = ZS_EMPTY;
 	else if (inuse == max_objects)
 		fg = ZS_FULL;
-	else if (inuse <= max_objects / fullness_threshold_frac)
+	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
 		fg = ZS_ALMOST_EMPTY;
 	else
 		fg = ZS_ALMOST_FULL;
@@ -341,33 +658,46 @@ static enum fullness_group get_fullness_group(struct page *page,
  * have. This functions inserts the given zspage into the freelist
  * identified by <class, fullness_group>.
  */
-static void insert_zspage(struct page *page, struct size_class *class,
-				enum fullness_group fullness)
+static void insert_zspage(struct size_class *class,
+				enum fullness_group fullness,
+				struct page *first_page)
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
 
+	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
 	head = &class->fullness_list[fullness];
-	if (*head)
-		list_add_tail(&page->lru, &(*head)->lru);
+	if (!*head) {
+		*head = first_page;
+		return;
+	}
 
-	*head = page;
+	/*
+	 * We want to see more ZS_FULL pages and less almost
+	 * empty/full. Put pages with higher ->inuse first.
+	 */
+	list_add_tail(&first_page->lru, &(*head)->lru);
+	if (first_page->inuse >= (*head)->inuse)
+		*head = first_page;
 }
 
 /*
  * This function removes the given zspage from the freelist identified
  * by <class, fullness_group>.
  */
-static void remove_zspage(struct page *page, struct size_class *class,
-				enum fullness_group fullness)
+static void remove_zspage(struct size_class *class,
+				enum fullness_group fullness,
+				struct page *first_page)
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
@@ -376,11 +706,13 @@ static void remove_zspage(struct page *page, struct size_class *class,
 	BUG_ON(!*head);
 	if (list_empty(&(*head)->lru))
 		*head = NULL;
-	else if (*head == page)
+	else if (*head == first_page)
 		*head = (struct page *)list_entry((*head)->lru.next,
 					struct page, lru);
 
-	list_del_init(&page->lru);
+	list_del_init(&first_page->lru);
+	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
 }
 
 /*
@@ -392,24 +724,22 @@ static void remove_zspage(struct page *page, struct size_class *class,
  * page from the freelist of the old fullness group to that of the new
  * fullness group.
  */
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
-						struct page *page)
+static enum fullness_group fix_fullness_group(struct size_class *class,
+						struct page *first_page)
 {
 	int class_idx;
-	struct size_class *class;
 	enum fullness_group currfg, newfg;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	get_zspage_mapping(page, &class_idx, &currfg);
-	class = &pool->size_class[class_idx];
-	newfg = get_fullness_group(page, class);
+	get_zspage_mapping(first_page, &class_idx, &currfg);
+	newfg = get_fullness_group(first_page);
 	if (newfg == currfg)
 		goto out;
 
-	remove_zspage(page, class, currfg);
-	insert_zspage(page, class, newfg);
-	set_zspage_mapping(page, class_idx, newfg);
+	remove_zspage(class, currfg, first_page);
+	insert_zspage(class, newfg, first_page);
+	set_zspage_mapping(first_page, class_idx, newfg);
 
 out:
 	return newfg;
@@ -420,7 +750,8 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
  * to form a zspage for each size class. This is important
  * to reduce wastage due to unusable space left at end of
  * each zspage which is given as:
- *	wastage = Zp - Zp % size_class
+ *     wastage = Zp % class_size
+ *     usage = Zp - wastage
  * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
  *
  * For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -460,7 +791,7 @@ static struct page *get_first_page(struct page *page)
 	if (is_first_page(page))
 		return page;
 	else
-		return page->first_page;
+		return (struct page *)page_private(page);
 }
 
 static struct page *get_next_page(struct page *page)
@@ -470,35 +801,59 @@ static struct page *get_next_page(struct page *page)
 	if (is_last_page(page))
 		next = NULL;
 	else if (is_first_page(page))
-		next = (struct page *)page->private;
+		next = (struct page *)page_private(page);
 	else
 		next = list_entry(page->lru.next, struct page, lru);
 
 	return next;
 }
 
-/* Encode <page, obj_idx> as a single handle value */
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+/*
+ * Encode <page, obj_idx> as a single handle value.
+ * We use the least bit of handle for tagging.
+ */
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
 {
-	unsigned long handle;
+	unsigned long obj;
 
 	if (!page) {
 		BUG_ON(obj_idx);
 		return NULL;
 	}
 
-	handle = page_to_pfn(page) << OBJ_INDEX_BITS;
-	handle |= (obj_idx & OBJ_INDEX_MASK);
+	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+	obj |= ((obj_idx) & OBJ_INDEX_MASK);
+	obj <<= OBJ_TAG_BITS;
 
-	return (void *)handle;
+	return (void *)obj;
 }
 
-/* Decode <page, obj_idx> pair from the given object handle */
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+/*
+ * Decode <page, obj_idx> pair from the given object handle. We adjust the
+ * decoded obj_idx back to its original value since it was adjusted in
+ * location_to_obj().
+ */
+static void obj_to_location(unsigned long obj, struct page **page,
 				unsigned long *obj_idx)
 {
-	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
-	*obj_idx = handle & OBJ_INDEX_MASK;
+	obj >>= OBJ_TAG_BITS;
+	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+	*obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+	return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+			void *obj)
+{
+	if (class->huge) {
+		VM_BUG_ON(!is_first_page(page));
+		return page_private(page);
+	} else
+		return *(unsigned long *)obj;
 }
 
 static unsigned long obj_idx_to_offset(struct page *page,
@@ -512,6 +867,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
 	return off + obj_idx * class_size;
 }
 
+static inline int trypin_tag(unsigned long handle)
+{
+	unsigned long *ptr = (unsigned long *)handle;
+
+	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+	while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+	unsigned long *ptr = (unsigned long *)handle;
+
+	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
 static void reset_page(struct page *page)
 {
 	clear_bit(PG_private, &page->flags);
@@ -519,10 +893,10 @@ static void reset_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	page->freelist = NULL;
-	reset_page_mapcount(page);
+	page_mapcount_reset(page);
 }
 
-static void free_zspage(struct zs_ops *ops, struct page *first_page)
+static void free_zspage(struct page *first_page)
 {
 	struct page *nextp, *tmp, *head_extra;
 
@@ -532,7 +906,7 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page)
 	head_extra = (struct page *)page_private(first_page);
 
 	reset_page(first_page);
-	ops->free(first_page);
+	__free_page(first_page);
 
 	/* zspage with only 1 system page */
 	if (!head_extra)
@@ -541,14 +915,14 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page)
 	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
 		list_del(&nextp->lru);
 		reset_page(nextp);
-		ops->free(nextp);
+		__free_page(nextp);
 	}
 	reset_page(head_extra);
-	ops->free(head_extra);
+	__free_page(head_extra);
 }
 
 /* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
+static void init_zspage(struct size_class *class, struct page *first_page)
 {
 	unsigned long off = 0;
 	struct page *page = first_page;
@@ -557,7 +931,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 	while (page) {
 		struct page *next_page;
 		struct link_free *link;
-		unsigned int i, objs_on_page;
+		unsigned int i = 1;
+		void *vaddr;
 
 		/*
 		 * page->index stores offset of first object starting
@@ -568,16 +943,12 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 		if (page != first_page)
 			page->index = off;
 
-		link = (struct link_free *)kmap_atomic(page) +
-						off / sizeof(*link);
-		objs_on_page = (PAGE_SIZE - off) / class->size;
+		vaddr = kmap_atomic(page);
+		link = (struct link_free *)vaddr + off / sizeof(*link);
 
-		for (i = 1; i <= objs_on_page; i++) {
-			off += class->size;
-			if (off < PAGE_SIZE) {
-				link->next = obj_location_to_handle(page, i);
-				link += class->size / sizeof(*link);
-			}
+		while ((off += class->size) < PAGE_SIZE) {
+			link->next = location_to_obj(page, i++);
+			link += class->size / sizeof(*link);
 		}
 
 		/*
@@ -586,18 +957,17 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 		 * page (if present)
 		 */
 		next_page = get_next_page(page);
-		link->next = obj_location_to_handle(next_page, 0);
-		kunmap_atomic(link);
+		link->next = location_to_obj(next_page, 0);
+		kunmap_atomic(vaddr);
 		page = next_page;
-		off = (off + class->size) % PAGE_SIZE;
+		off %= PAGE_SIZE;
 	}
 }
 
 /*
  * Allocate a zspage for the given size class
  */
-static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
-				gfp_t flags)
+static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 {
 	int i, error;
 	struct page *first_page = NULL, *uninitialized_var(prev_page);
@@ -606,7 +976,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
 	 * Allocate individual pages and link them together as:
 	 * 1. first page->private = first sub-page
 	 * 2. all sub-pages are linked together using page->lru
-	 * 3. each sub-page is linked to the first page using page->first_page
+	 * 3. each sub-page is linked to the first page using page->private
 	 *
 	 * For each size class, First/Head pages are linked together using
 	 * page->lru. Also, we set PG_private to identify the first page
@@ -617,7 +987,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
 	for (i = 0; i < class->pages_per_zspage; i++) {
 		struct page *page;
 
-		page = ops->alloc(flags);
+		page = alloc_page(flags);
 		if (!page)
 			goto cleanup;
 
@@ -629,9 +999,9 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
 			first_page->inuse = 0;
 		}
 		if (i == 1)
-			first_page->private = (unsigned long)page;
+			set_page_private(first_page, (unsigned long)page);
 		if (i >= 1)
-			page->first_page = first_page;
+			set_page_private(page, (unsigned long)first_page);
 		if (i >= 2)
 			list_add(&page->lru, &prev_page->lru);
 		if (i == class->pages_per_zspage - 1)	/* last page */
@@ -639,15 +1009,17 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class,
 		prev_page = page;
 	}
 
-	init_zspage(first_page, class);
+	init_zspage(class, first_page);
 
-	first_page->freelist = obj_location_to_handle(first_page, 0);
+	first_page->freelist = location_to_obj(first_page, 0);
+	/* Maximum number of objects we can store in this zspage */
+	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
 
 	error = 0; /* Success */
 
 cleanup:
 	if (unlikely(error) && first_page) {
-		free_zspage(ops, first_page);
+		free_zspage(first_page);
 		first_page = NULL;
 	}
 
@@ -702,14 +1074,11 @@ static inline void __zs_unmap_object(struct mapping_area *area,
 				struct page *pages[2], int off, int size)
 {
 	unsigned long addr = (unsigned long)area->vm_addr;
-	unsigned long end = addr + (PAGE_SIZE * 2);
 
-	flush_cache_vunmap(addr, end);
-	unmap_kernel_range_noflush(addr, PAGE_SIZE * 2);
-	flush_tlb_kernel_range(addr, end);
+	unmap_kernel_range(addr, PAGE_SIZE * 2);
 }
 
-#else /* CONFIG_PGTABLE_MAPPING*/
+#else /* CONFIG_PGTABLE_MAPPING */
 
 static inline int __zs_cpu_up(struct mapping_area *area)
 {
@@ -719,7 +1088,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
 	 */
 	if (area->vm_buf)
 		return 0;
-	area->vm_buf = (char *)__get_free_page(GFP_KERNEL);
+	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
 	if (!area->vm_buf)
 		return -ENOMEM;
 	return 0;
@@ -727,8 +1096,7 @@ static inline int __zs_cpu_up(struct mapping_area *area)
 
 static inline void __zs_cpu_down(struct mapping_area *area)
 {
-	if (area->vm_buf)
-		free_page((unsigned long)area->vm_buf);
+	kfree(area->vm_buf);
 	area->vm_buf = NULL;
 }
 
@@ -765,12 +1133,17 @@ static void __zs_unmap_object(struct mapping_area *area,
 {
 	int sizes[2];
 	void *addr;
-	char *buf = area->vm_buf;
+	char *buf;
 
 	/* no write fastpath */
 	if (area->vm_mm == ZS_MM_RO)
 		goto out;
 
+	buf = area->vm_buf;
+	buf = buf + ZS_HANDLE_SIZE;
+	size -= ZS_HANDLE_SIZE;
+	off += ZS_HANDLE_SIZE;
+
 	sizes[0] = PAGE_SIZE - off;
 	sizes[1] = size - sizes[0];
 
@@ -816,197 +1189,71 @@ static struct notifier_block zs_cpu_nb = {
 	.notifier_call = zs_cpu_notifier
 };
 
-static void zs_exit(void)
+static int zs_register_cpu_notifier(void)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
-	unregister_cpu_notifier(&zs_cpu_nb);
-}
+	int cpu, uninitialized_var(ret);
 
-static int zs_init(void)
-{
-	int cpu, ret;
+	cpu_notifier_register_begin();
 
-	register_cpu_notifier(&zs_cpu_nb);
+	__register_cpu_notifier(&zs_cpu_nb);
 	for_each_online_cpu(cpu) {
 		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
 		if (notifier_to_errno(ret))
-			goto fail;
+			break;
 	}
-	return 0;
-fail:
-	zs_exit();
+
+	cpu_notifier_register_done();
 	return notifier_to_errno(ret);
 }
 
-/**
- * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
- * @ops: allocation/free callbacks for expanding the pool
- *
- * This function must be called before anything when using
- * the zsmalloc allocator.
- *
- * On success, a pointer to the newly created pool is returned,
- * otherwise NULL.
- */
-struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops)
+static void zs_unregister_cpu_notifier(void)
 {
-	int i, ovhd_size;
-	struct zs_pool *pool;
-
-	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
-	pool = kzalloc(ovhd_size, flags);
-	if (!pool)
-		return NULL;
+	int cpu;
 
-	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
-		int size;
-		struct size_class *class;
+	cpu_notifier_register_begin();
 
-		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
-		if (size > ZS_MAX_ALLOC_SIZE)
-			size = ZS_MAX_ALLOC_SIZE;
+	for_each_online_cpu(cpu)
+		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
+	__unregister_cpu_notifier(&zs_cpu_nb);
 
-		class = &pool->size_class[i];
-		class->size = size;
-		class->index = i;
-		spin_lock_init(&class->lock);
-		class->pages_per_zspage = get_pages_per_zspage(size);
+	cpu_notifier_register_done();
+}
 
-	}
+static void init_zs_size_classes(void)
+{
+	int nr;
 
-	if (ops)
-		pool->ops = ops;
-	else
-		pool->ops = &zs_default_ops;
+	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
+	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
+		nr += 1;
 
-	return pool;
+	zs_size_classes = nr;
 }
-EXPORT_SYMBOL_GPL(zs_create_pool);
 
-void zs_destroy_pool(struct zs_pool *pool)
+static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 {
-	int i;
+	if (prev->pages_per_zspage != pages_per_zspage)
+		return false;
 
-	for (i = 0; i < ZS_SIZE_CLASSES; i++) {
-		int fg;
-		struct size_class *class = &pool->size_class[i];
+	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
+		!= get_maxobj_per_zspage(size, pages_per_zspage))
+		return false;
 
-		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
-			if (class->fullness_list[fg]) {
-				pr_info("Freeing non-empty class with size "
-					"%db, fullness group %d\n",
-					class->size, fg);
-			}
-		}
-	}
-	kfree(pool);
+	return true;
 }
-EXPORT_SYMBOL_GPL(zs_destroy_pool);
 
-/**
- * zs_malloc - Allocate block of given size from pool.
- * @pool: pool to allocate from
- * @size: size of block to allocate
- *
- * On success, handle to the allocated object is returned,
- * otherwise 0.
- * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
- */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags)
+static bool zspage_full(struct page *first_page)
 {
-	unsigned long obj;
-	struct link_free *link;
-	int class_idx;
-	struct size_class *class;
-
-	struct page *first_page, *m_page;
-	unsigned long m_objidx, m_offset;
-
-	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
-		return 0;
-
-	class_idx = get_size_class_index(size);
-	class = &pool->size_class[class_idx];
-	BUG_ON(class_idx != class->index);
-
-	spin_lock(&class->lock);
-	first_page = find_get_zspage(class);
-
-	if (!first_page) {
-		spin_unlock(&class->lock);
-		first_page = alloc_zspage(pool->ops, class, flags);
-		if (unlikely(!first_page))
-			return 0;
-
-		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
-		spin_lock(&class->lock);
-		class->pages_allocated += class->pages_per_zspage;
-	}
-
-	obj = (unsigned long)first_page->freelist;
-	obj_handle_to_location(obj, &m_page, &m_objidx);
-	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
-	link = (struct link_free *)kmap_atomic(m_page) +
-					m_offset / sizeof(*link);
-	first_page->freelist = link->next;
-	memset(link, POISON_INUSE, sizeof(*link));
-	kunmap_atomic(link);
-
-	first_page->inuse++;
-	/* Now move the zspage to another fullness group, if required */
-	fix_fullness_group(pool, first_page);
-	spin_unlock(&class->lock);
+	BUG_ON(!is_first_page(first_page));
 
-	return obj;
+	return first_page->inuse == first_page->objects;
 }
-EXPORT_SYMBOL_GPL(zs_malloc);
 
-void zs_free(struct zs_pool *pool, unsigned long obj)
+unsigned long zs_get_total_pages(struct zs_pool *pool)
 {
-	struct link_free *link;
-	struct page *first_page, *f_page;
-	unsigned long f_objidx, f_offset;
-
-	int class_idx;
-	struct size_class *class;
-	enum fullness_group fullness;
-
-	if (unlikely(!obj))
-		return;
-
-	obj_handle_to_location(obj, &f_page, &f_objidx);
-	first_page = get_first_page(f_page);
-
-	get_zspage_mapping(first_page, &class_idx, &fullness);
-	class = &pool->size_class[class_idx];
-	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
-
-	spin_lock(&class->lock);
-
-	/* Insert this object in containing zspage's freelist */
-	link = (struct link_free *)((unsigned char *)kmap_atomic(f_page)
-							+ f_offset);
-	link->next = first_page->freelist;
-	kunmap_atomic(link);
-	first_page->freelist = (void *)obj;
-
-	first_page->inuse--;
-	fullness = fix_fullness_group(pool, first_page);
-
-	if (fullness == ZS_EMPTY)
-		class->pages_allocated -= class->pages_per_zspage;
-
-	spin_unlock(&class->lock);
-
-	if (fullness == ZS_EMPTY)
-		free_zspage(pool->ops, first_page);
+	return atomic_long_read(&pool->pages_allocated);
 }
-EXPORT_SYMBOL_GPL(zs_free);
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
 
 /**
  * zs_map_object - get address of allocated object from handle.
@@ -1021,18 +1268,19 @@ EXPORT_SYMBOL_GPL(zs_free);
  * against nested mappings.
  *
  * This function returns with preemption and page faults disabled.
-*/
+ */
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 			enum zs_mapmode mm)
 {
 	struct page *page;
-	unsigned long obj_idx, off;
+	unsigned long obj, obj_idx, off;
 
 	unsigned int class_idx;
 	enum fullness_group fg;
 	struct size_class *class;
 	struct mapping_area *area;
 	struct page *pages[2];
+	void *ret;
 
 	BUG_ON(!handle);
 
@@ -1043,9 +1291,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	 */
 	BUG_ON(in_interrupt());
 
-	obj_handle_to_location(handle, &page, &obj_idx);
+	/* From now on, migration cannot move the object */
+	pin_tag(handle);
+
+	obj = handle_to_obj(handle);
+	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
-	class = &pool->size_class[class_idx];
+	class = pool->size_class[class_idx];
 	off = obj_idx_to_offset(page, obj_idx, class->size);
 
 	area = &get_cpu_var(zs_map_area);
@@ -1053,7 +1305,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	if (off + class->size <= PAGE_SIZE) {
 		/* this object is contained entirely within a page */
 		area->vm_addr = kmap_atomic(page);
-		return area->vm_addr + off;
+		ret = area->vm_addr + off;
+		goto out;
 	}
 
 	/* this object spans two pages */
@@ -1061,14 +1314,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	pages[1] = get_next_page(page);
 	BUG_ON(!pages[1]);
 
-	return __zs_map_object(area, pages, off, class->size);
+	ret = __zs_map_object(area, pages, off, class->size);
+out:
+	if (!class->huge)
+		ret += ZS_HANDLE_SIZE;
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(zs_map_object);
 
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 {
 	struct page *page;
-	unsigned long obj_idx, off;
+	unsigned long obj, obj_idx, off;
 
 	unsigned int class_idx;
 	enum fullness_group fg;
@@ -1077,12 +1335,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 
 	BUG_ON(!handle);
 
-	obj_handle_to_location(handle, &page, &obj_idx);
+	obj = handle_to_obj(handle);
+	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
-	class = &pool->size_class[class_idx];
+	class = pool->size_class[class_idx];
 	off = obj_idx_to_offset(page, obj_idx, class->size);
 
-	area = &__get_cpu_var(zs_map_area);
+	area = this_cpu_ptr(&zs_map_area);
 	if (off + class->size <= PAGE_SIZE)
 		kunmap_atomic(area->vm_addr);
 	else {
@@ -1095,20 +1354,704 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 		__zs_unmap_object(area, pages, off, class->size);
 	}
 	put_cpu_var(zs_map_area);
+	unpin_tag(handle);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
-u64 zs_get_total_size_bytes(struct zs_pool *pool)
+static unsigned long obj_malloc(struct size_class *class,
+				struct page *first_page, unsigned long handle)
+{
+	unsigned long obj;
+	struct link_free *link;
+
+	struct page *m_page;
+	unsigned long m_objidx, m_offset;
+	void *vaddr;
+
+	handle |= OBJ_ALLOCATED_TAG;
+	obj = (unsigned long)first_page->freelist;
+	obj_to_location(obj, &m_page, &m_objidx);
+	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+	vaddr = kmap_atomic(m_page);
+	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+	first_page->freelist = link->next;
+	if (!class->huge)
+		/* record handle in the header of allocated chunk */
+		link->handle = handle;
+	else
+		/* record handle in first_page->private */
+		set_page_private(first_page, handle);
+	kunmap_atomic(vaddr);
+	first_page->inuse++;
+	zs_stat_inc(class, OBJ_USED, 1);
+
+	return obj;
+}
+
+
+/**
+ * zs_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ *
+ * On success, handle to the allocated object is returned,
+ * otherwise 0.
+ * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
+ */
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
+{
+	unsigned long handle, obj;
+	struct size_class *class;
+	struct page *first_page;
+
+	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
+		return 0;
+
+	handle = alloc_handle(pool, gfp);
+	if (!handle)
+		return 0;
+
+	/* extra space in chunk to keep the handle */
+	size += ZS_HANDLE_SIZE;
+	class = pool->size_class[get_size_class_index(size)];
+
+	spin_lock(&class->lock);
+	first_page = find_get_zspage(class);
+
+	if (!first_page) {
+		spin_unlock(&class->lock);
+		first_page = alloc_zspage(class, gfp);
+		if (unlikely(!first_page)) {
+			free_handle(pool, handle);
+			return 0;
+		}
+
+		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+		atomic_long_add(class->pages_per_zspage,
+					&pool->pages_allocated);
+
+		spin_lock(&class->lock);
+		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
+	}
+
+	obj = obj_malloc(class, first_page, handle);
+	/* Now move the zspage to another fullness group, if required */
+	fix_fullness_group(class, first_page);
+	record_obj(handle, obj);
+	spin_unlock(&class->lock);
+
+	return handle;
+}
+EXPORT_SYMBOL_GPL(zs_malloc);
+
+static void obj_free(struct size_class *class, unsigned long obj)
+{
+	struct link_free *link;
+	struct page *first_page, *f_page;
+	unsigned long f_objidx, f_offset;
+	void *vaddr;
+
+	BUG_ON(!obj);
+
+	obj &= ~OBJ_ALLOCATED_TAG;
+	obj_to_location(obj, &f_page, &f_objidx);
+	first_page = get_first_page(f_page);
+
+	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+
+	vaddr = kmap_atomic(f_page);
+
+	/* Insert this object in containing zspage's freelist */
+	link = (struct link_free *)(vaddr + f_offset);
+	link->next = first_page->freelist;
+	if (class->huge)
+		set_page_private(first_page, 0);
+	kunmap_atomic(vaddr);
+	first_page->freelist = (void *)obj;
+	first_page->inuse--;
+	zs_stat_dec(class, OBJ_USED, 1);
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+	struct page *first_page, *f_page;
+	unsigned long obj, f_objidx;
+	int class_idx;
+	struct size_class *class;
+	enum fullness_group fullness;
+
+	if (unlikely(!handle))
+		return;
+
+	pin_tag(handle);
+	obj = handle_to_obj(handle);
+	obj_to_location(obj, &f_page, &f_objidx);
+	first_page = get_first_page(f_page);
+
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	class = pool->size_class[class_idx];
+
+	spin_lock(&class->lock);
+	obj_free(class, obj);
+	fullness = fix_fullness_group(class, first_page);
+	if (fullness == ZS_EMPTY) {
+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+				class->size, class->pages_per_zspage));
+		atomic_long_sub(class->pages_per_zspage,
+				&pool->pages_allocated);
+		free_zspage(first_page);
+	}
+	spin_unlock(&class->lock);
+	unpin_tag(handle);
+
+	free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+				unsigned long src)
+{
+	struct page *s_page, *d_page;
+	unsigned long s_objidx, d_objidx;
+	unsigned long s_off, d_off;
+	void *s_addr, *d_addr;
+	int s_size, d_size, size;
+	int written = 0;
+
+	s_size = d_size = class->size;
+
+	obj_to_location(src, &s_page, &s_objidx);
+	obj_to_location(dst, &d_page, &d_objidx);
+
+	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+	if (s_off + class->size > PAGE_SIZE)
+		s_size = PAGE_SIZE - s_off;
+
+	if (d_off + class->size > PAGE_SIZE)
+		d_size = PAGE_SIZE - d_off;
+
+	s_addr = kmap_atomic(s_page);
+	d_addr = kmap_atomic(d_page);
+
+	while (1) {
+		size = min(s_size, d_size);
+		memcpy(d_addr + d_off, s_addr + s_off, size);
+		written += size;
+
+		if (written == class->size)
+			break;
+
+		s_off += size;
+		s_size -= size;
+		d_off += size;
+		d_size -= size;
+
+		if (s_off >= PAGE_SIZE) {
+			kunmap_atomic(d_addr);
+			kunmap_atomic(s_addr);
+			s_page = get_next_page(s_page);
+			BUG_ON(!s_page);
+			s_addr = kmap_atomic(s_page);
+			d_addr = kmap_atomic(d_page);
+			s_size = class->size - written;
+			s_off = 0;
+		}
+
+		if (d_off >= PAGE_SIZE) {
+			kunmap_atomic(d_addr);
+			d_page = get_next_page(d_page);
+			BUG_ON(!d_page);
+			d_addr = kmap_atomic(d_page);
+			d_size = class->size - written;
+			d_off = 0;
+		}
+	}
+
+	kunmap_atomic(d_addr);
+	kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct size_class *class,
+					struct page *page, int index)
+{
+	unsigned long head;
+	int offset = 0;
+	unsigned long handle = 0;
+	void *addr = kmap_atomic(page);
+
+	if (!is_first_page(page))
+		offset = page->index;
+	offset += class->size * index;
+
+	while (offset < PAGE_SIZE) {
+		head = obj_to_head(class, page, addr + offset);
+		if (head & OBJ_ALLOCATED_TAG) {
+			handle = head & ~OBJ_ALLOCATED_TAG;
+			if (trypin_tag(handle))
+				break;
+			handle = 0;
+		}
+
+		offset += class->size;
+		index++;
+	}
+
+	kunmap_atomic(addr);
+	return handle;
+}
+
+struct zs_compact_control {
+	/* Source page for migration which could be a subpage of zspage. */
+	struct page *s_page;
+	/* Destination page for migration which should be a first page
+	 * of zspage. */
+	struct page *d_page;
+	 /* Starting object index within @s_page which used for live object
+	  * in the subpage. */
+	int index;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+				struct zs_compact_control *cc)
+{
+	unsigned long used_obj, free_obj;
+	unsigned long handle;
+	struct page *s_page = cc->s_page;
+	struct page *d_page = cc->d_page;
+	unsigned long index = cc->index;
+	int ret = 0;
+
+	while (1) {
+		handle = find_alloced_obj(class, s_page, index);
+		if (!handle) {
+			s_page = get_next_page(s_page);
+			if (!s_page)
+				break;
+			index = 0;
+			continue;
+		}
+
+		/* Stop if there is no more space */
+		if (zspage_full(d_page)) {
+			unpin_tag(handle);
+			ret = -ENOMEM;
+			break;
+		}
+
+		used_obj = handle_to_obj(handle);
+		free_obj = obj_malloc(class, d_page, handle);
+		zs_object_copy(class, free_obj, used_obj);
+		index++;
+		/*
+		 * record_obj updates handle's value to free_obj and it will
+		 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
+		 * breaks synchronization using pin_tag(e,g, zs_free) so
+		 * let's keep the lock bit.
+		 */
+		free_obj |= BIT(HANDLE_PIN_BIT);
+		record_obj(handle, free_obj);
+		unpin_tag(handle);
+		obj_free(class, used_obj);
+	}
+
+	/* Remember last position in this iteration */
+	cc->s_page = s_page;
+	cc->index = index;
+
+	return ret;
+}
+
+static struct page *isolate_target_page(struct size_class *class)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+		page = class->fullness_list[i];
+		if (page) {
+			remove_zspage(class, i, page);
+			break;
+		}
+	}
+
+	return page;
+}
+
+/*
+ * putback_zspage - add @first_page into right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @fist_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+			struct size_class *class,
+			struct page *first_page)
+{
+	enum fullness_group fullness;
+
+	BUG_ON(!is_first_page(first_page));
+
+	fullness = get_fullness_group(first_page);
+	insert_zspage(class, fullness, first_page);
+	set_zspage_mapping(first_page, class->index, fullness);
+
+	if (fullness == ZS_EMPTY) {
+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+			class->size, class->pages_per_zspage));
+		atomic_long_sub(class->pages_per_zspage,
+				&pool->pages_allocated);
+
+		free_zspage(first_page);
+	}
+
+	return fullness;
+}
+
+static struct page *isolate_source_page(struct size_class *class)
 {
 	int i;
-	u64 npages = 0;
+	struct page *page = NULL;
+
+	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+		page = class->fullness_list[i];
+		if (!page)
+			continue;
 
-	for (i = 0; i < ZS_SIZE_CLASSES; i++)
-		npages += pool->size_class[i].pages_allocated;
+		remove_zspage(class, i, page);
+		break;
+	}
+
+	return page;
+}
+
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
+{
+	unsigned long obj_wasted;
+	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+	unsigned long obj_used = zs_stat_get(class, OBJ_USED);
+
+	if (obj_allocated <= obj_used)
+		return 0;
+
+	obj_wasted = obj_allocated - obj_used;
+	obj_wasted /= get_maxobj_per_zspage(class->size,
+			class->pages_per_zspage);
+
+	return obj_wasted * class->pages_per_zspage;
+}
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+{
+	struct zs_compact_control cc;
+	struct page *src_page;
+	struct page *dst_page = NULL;
+
+	spin_lock(&class->lock);
+	while ((src_page = isolate_source_page(class))) {
+
+		BUG_ON(!is_first_page(src_page));
+
+		if (!zs_can_compact(class))
+			break;
+
+		cc.index = 0;
+		cc.s_page = src_page;
+
+		while ((dst_page = isolate_target_page(class))) {
+			cc.d_page = dst_page;
+			/*
+			 * If there is no more space in dst_page, resched
+			 * and see if anyone had allocated another zspage.
+			 */
+			if (!migrate_zspage(pool, class, &cc))
+				break;
+
+			putback_zspage(pool, class, dst_page);
+		}
+
+		/* Stop if we couldn't find slot */
+		if (dst_page == NULL)
+			break;
+
+		putback_zspage(pool, class, dst_page);
+		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+			pool->stats.pages_compacted += class->pages_per_zspage;
+		spin_unlock(&class->lock);
+		cond_resched();
+		spin_lock(&class->lock);
+	}
+
+	if (src_page)
+		putback_zspage(pool, class, src_page);
+
+	spin_unlock(&class->lock);
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+	int i;
+	struct size_class *class;
+
+	for (i = zs_size_classes - 1; i >= 0; i--) {
+		class = pool->size_class[i];
+		if (!class)
+			continue;
+		if (class->index != i)
+			continue;
+		__zs_compact(pool, class);
+	}
+
+	return pool->stats.pages_compacted;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
+
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+		struct shrink_control *sc)
+{
+	unsigned long pages_freed;
+	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+			shrinker);
+
+	pages_freed = pool->stats.pages_compacted;
+	/*
+	 * Compact classes and calculate compaction delta.
+	 * Can run concurrently with a manually triggered
+	 * (by user) compaction.
+	 */
+	pages_freed = zs_compact(pool) - pages_freed;
+
+	return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+		struct shrink_control *sc)
+{
+	int i;
+	struct size_class *class;
+	unsigned long pages_to_free = 0;
+	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+			shrinker);
+
+	for (i = zs_size_classes - 1; i >= 0; i--) {
+		class = pool->size_class[i];
+		if (!class)
+			continue;
+		if (class->index != i)
+			continue;
+
+		pages_to_free += zs_can_compact(class);
+	}
+
+	return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+	if (pool->shrinker_enabled) {
+		unregister_shrinker(&pool->shrinker);
+		pool->shrinker_enabled = false;
+	}
+}
+
+static void zs_register_shrinker(struct zs_pool *pool)
+{
+	pool->shrinker.scan_objects = zs_shrinker_scan;
+	pool->shrinker.count_objects = zs_shrinker_count;
+	pool->shrinker.batch = 0;
+	pool->shrinker.seeks = DEFAULT_SEEKS;
+
+	register_shrinker(&pool->shrinker);
+}
+
+static int zs_register_shrinker_int(struct zs_pool *pool)
+{
+	zs_register_shrinker(pool);
+	return 0;
+}
+
+/**
+ * zs_create_pool - Creates an allocation pool to work from.
+ * @flags: allocation flags used to allocate pool metadata
+ *
+ * This function must be called before anything when using
+ * the zsmalloc allocator.
+ *
+ * On success, a pointer to the newly created pool is returned,
+ * otherwise NULL.
+ */
+struct zs_pool *zs_create_pool(const char *name)
+{
+	int i;
+	struct zs_pool *pool;
+	struct size_class *prev_class = NULL;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
+			GFP_KERNEL);
+	if (!pool->size_class) {
+		kfree(pool);
+		return NULL;
+	}
+
+	pool->name = kstrdup(name, GFP_KERNEL);
+	if (!pool->name)
+		goto err;
+
+	if (create_handle_cache(pool))
+		goto err;
+
+	/*
+	 * Iterate reversly, because, size of size_class that we want to use
+	 * for merging should be larger or equal to current size.
+	 */
+	for (i = zs_size_classes - 1; i >= 0; i--) {
+		int size;
+		int pages_per_zspage;
+		struct size_class *class;
+
+		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
+		if (size > ZS_MAX_ALLOC_SIZE)
+			size = ZS_MAX_ALLOC_SIZE;
+		pages_per_zspage = get_pages_per_zspage(size);
+
+		/*
+		 * size_class is used for normal zsmalloc operation such
+		 * as alloc/free for that size. Although it is natural that we
+		 * have one size_class for each size, there is a chance that we
+		 * can get more memory utilization if we use one size_class for
+		 * many different sizes whose size_class have same
+		 * characteristics. So, we makes size_class point to
+		 * previous size_class if possible.
+		 */
+		if (prev_class) {
+			if (can_merge(prev_class, size, pages_per_zspage)) {
+				pool->size_class[i] = prev_class;
+				continue;
+			}
+		}
+
+		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
+		if (!class)
+			goto err;
+
+		class->size = size;
+		class->index = i;
+		class->pages_per_zspage = pages_per_zspage;
+		if (pages_per_zspage == 1 &&
+			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+			class->huge = true;
+		spin_lock_init(&class->lock);
+		pool->size_class[i] = class;
+
+		prev_class = class;
+	}
+
+	/* debug only, don't abort if it fails */
+	zs_pool_stat_create(pool, name);
+
+	/*
+	 * Not critical, we still can use the pool
+	 * and user can trigger compaction manually.
+	 */
+	if (zs_register_shrinker_int(pool) == 0)
+		pool->shrinker_enabled = true;
+	return pool;
+
+err:
+	zs_destroy_pool(pool);
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(zs_create_pool);
+
+void zs_destroy_pool(struct zs_pool *pool)
+{
+	int i;
+
+	zs_unregister_shrinker(pool);
+	zs_pool_stat_destroy(pool);
+
+	for (i = 0; i < zs_size_classes; i++) {
+		int fg;
+		struct size_class *class = pool->size_class[i];
+
+		if (!class)
+			continue;
+
+		if (class->index != i)
+			continue;
+
+		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
+			if (class->fullness_list[fg]) {
+				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
+					class->size, fg);
+			}
+		}
+		kfree(class);
+	}
+
+	destroy_handle_cache(pool);
+	kfree(pool->size_class);
+	kfree(pool->name);
+	kfree(pool);
+}
+EXPORT_SYMBOL_GPL(zs_destroy_pool);
+
+static int __init zs_init(void)
+{
+	int ret = zs_register_cpu_notifier();
+
+	if (ret)
+		goto notifier_fail;
+
+	init_zs_size_classes();
+
+#ifdef CONFIG_ZPOOL
+	zpool_register_driver(&zs_zpool_driver);
+#endif
+
+	zs_stat_init();
+
+	return 0;
+
+notifier_fail:
+	zs_unregister_cpu_notifier();
+
+	return ret;
+}
+
+static void __exit zs_exit(void)
+{
+#ifdef CONFIG_ZPOOL
+	zpool_unregister_driver(&zs_zpool_driver);
+#endif
+	zs_unregister_cpu_notifier();
 
-	return npages << PAGE_SHIFT;
+	zs_stat_exit();
 }
-EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
 
 module_init(zs_init);
 module_exit(zs_exit);
diff --git a/mm/zswap.c b/mm/zswap.c
index b825f9876..d5980efc1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1,11 +1,11 @@
 /*
  * zswap.c - zswap driver file
  *
- * zswap is a backend for frontswap that takes pages that are in the
- * process of being swapped out and attempts to compress them and store
- * them in a RAM-based memory pool.  This results in a significant I/O
- * reduction on the real swap device and, in the case of a slow swap
- * device, can also improve workload performance.
+ * zswap is a backend for frontswap that takes pages that are in the process
+ * of being swapped out and attempts to compress and store them in a
+ * RAM-based memory pool.  This can result in a significant I/O reduction on
+ * the swap device and, in the case where decompressing from RAM is faster
+ * than reading from the swap device, can also improve workload performance.
  *
  * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
  *
@@ -34,7 +34,7 @@
 #include <linux/swap.h>
 #include <linux/crypto.h>
 #include <linux/mempool.h>
-#include <linux/zsmalloc.h>
+#include <linux/zpool.h>
 
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
@@ -45,65 +45,62 @@
 /*********************************
 * statistics
 **********************************/
-/* Number of memory pages used by the compressed pool */
-atomic_t zswap_pool_pages = ATOMIC_INIT(0);
+/* Total bytes used by the compressed storage */
+u64 zswap_pool_total_size;
 /* The number of compressed pages currently stored in zswap */
 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
 
-#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK
-/* The number of outstanding pages awaiting writeback */
-static atomic_t zswap_outstanding_writebacks = ATOMIC_INIT(0);
-#endif
-
 /*
  * The statistics below are not protected from concurrent access for
  * performance reasons so they may not be a 100% accurate.  However,
  * they do provide useful information on roughly how many times a
  * certain event is occurring.
 */
+
+/* Pool limit was hit (see zswap_max_pool_percent) */
 static u64 zswap_pool_limit_hit;
+/* Pages written back when pool limit was reached */
 static u64 zswap_written_back_pages;
+/* Store failed due to a reclaim failure after pool limit was reached */
+static u64 zswap_reject_reclaim_fail;
+/* Compressed page was too big for the allocator to (optimally) store */
 static u64 zswap_reject_compress_poor;
-static u64 zswap_writeback_attempted;
-static u64 zswap_reject_tmppage_fail;
-static u64 zswap_reject_zsmalloc_fail;
+/* Store failed because underlying allocator could not get memory */
+static u64 zswap_reject_alloc_fail;
+/* Store failed because the entry metadata could not be allocated (rare) */
 static u64 zswap_reject_kmemcache_fail;
-static u64 zswap_saved_by_writeback;
+/* Duplicate store was encountered (rare) */
 static u64 zswap_duplicate_entry;
 
 /*********************************
 * tunables
 **********************************/
-/* Enable/disable zswap (enabled by default, fixed at boot for now) */
+
+/* Enable/disable zswap (disabled by default) */
 static bool zswap_enabled = 1;
-module_param_named(enabled, zswap_enabled, bool, 0);
+module_param_named(enabled, zswap_enabled, bool, 0644);
 
 /* Compressor to be used by zswap (fixed at boot for now) */
 #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
+#ifndef CONFIG_CRYPTO_LZ4
 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
-module_param_named(compressor, zswap_compressor, charp, 0);
+#else
+static char *zswap_compressor = "lz4";
+#endif
+module_param_named(compressor, zswap_compressor, charp, 0444);
 
 /* The maximum percentage of memory that the compressed pool can occupy */
 static unsigned int zswap_max_pool_percent = 20;
 module_param_named(max_pool_percent,
 			zswap_max_pool_percent, uint, 0644);
 
-/*
- * Maximum compression ratio, as as percentage, for an acceptable
- * compressed page. Any pages that do not compress by at least
- * this ratio will be rejected.
-*/
-static unsigned int zswap_max_compression_ratio = 80;
-module_param_named(max_compression_ratio,
-			zswap_max_compression_ratio, uint, 0644);
+/* Compressed storage to use */
+#define ZSWAP_ZPOOL_DEFAULT "zbud"
+static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+module_param_named(zpool, zswap_zpool_type, charp, 0444);
 
-/*
- * Maximum number of outstanding writebacks allowed at any given time.
- * This is to prevent decompressing an unbounded number of compressed
- * pages into the swap cache all at once, and to help with writeback
- * congestion.
-*/
-#define ZSWAP_MAX_OUTSTANDING_FLUSHES 64
+/* zpool is shared by all of zswap backend  */
+static struct zpool *zswap_pool;
 
 /*********************************
 * compression functions
@@ -157,17 +154,15 @@ static int __init zswap_comp_init(void)
 	return 0;
 }
 
-static void zswap_comp_exit(void)
+static void __init zswap_comp_exit(void)
 {
 	/* free percpu transforms */
-	if (zswap_comp_pcpu_tfms)
-		free_percpu(zswap_comp_pcpu_tfms);
+	free_percpu(zswap_comp_pcpu_tfms);
 }
 
 /*********************************
 * data structures
 **********************************/
-
 /*
  * struct zswap_entry
  *
@@ -175,41 +170,37 @@ static void zswap_comp_exit(void)
  * page within zswap.
  *
  * rbnode - links the entry into red-black tree for the appropriate swap type
- * lru - links the entry into the lru list for the appropriate swap type
  * refcount - the number of outstanding reference to the entry. This is needed
  *            to protect against premature freeing of the entry by code
- *            concurent calls to load, invalidate, and writeback.  The lock
+ *            concurrent calls to load, invalidate, and writeback.  The lock
  *            for the zswap_tree structure that contains the entry must
  *            be held while changing the refcount.  Since the lock must
  *            be held, there is no reason to also make refcount atomic.
- * type - the swap type for the entry.  Used to map back to the zswap_tree
- *        structure that contains the entry.
  * offset - the swap offset for the entry.  Index into the red-black tree.
- * handle - zsmalloc allocation handle that stores the compressed page data
+ * handle - zpool allocation handle that stores the compressed page data
  * length - the length in bytes of the compressed page data.  Needed during
- *           decompression
+ *          decompression
  */
 struct zswap_entry {
 	struct rb_node rbnode;
-	struct list_head lru;
-	int refcount;
 	pgoff_t offset;
-	unsigned long handle;
+	int refcount;
 	unsigned int length;
+	unsigned long handle;
+};
+
+struct zswap_header {
+	swp_entry_t swpentry;
 };
 
 /*
  * The tree lock in the zswap_tree struct protects a few things:
  * - the rbtree
- * - the lru list
  * - the refcount field of each entry in the tree
  */
 struct zswap_tree {
 	struct rb_root rbroot;
-	struct list_head lru;
 	spinlock_t lock;
-	struct zs_pool *pool;
-	unsigned type;
 };
 
 static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -217,49 +208,35 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 /*********************************
 * zswap entry functions
 **********************************/
-#define ZSWAP_KMEM_CACHE_NAME "zswap_entry_cache"
 static struct kmem_cache *zswap_entry_cache;
 
-static inline int zswap_entry_cache_create(void)
+static int __init zswap_entry_cache_create(void)
 {
-	zswap_entry_cache =
-		kmem_cache_create(ZSWAP_KMEM_CACHE_NAME,
-			sizeof(struct zswap_entry), 0, 0, NULL);
-	return (zswap_entry_cache == NULL);
+	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
+	return zswap_entry_cache == NULL;
 }
 
-static inline void zswap_entry_cache_destory(void)
+static void __init zswap_entry_cache_destroy(void)
 {
 	kmem_cache_destroy(zswap_entry_cache);
 }
 
-static inline struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 {
 	struct zswap_entry *entry;
 	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
 	if (!entry)
 		return NULL;
-	INIT_LIST_HEAD(&entry->lru);
 	entry->refcount = 1;
+	RB_CLEAR_NODE(&entry->rbnode);
 	return entry;
 }
 
-static inline void zswap_entry_cache_free(struct zswap_entry *entry)
+static void zswap_entry_cache_free(struct zswap_entry *entry)
 {
 	kmem_cache_free(zswap_entry_cache, entry);
 }
 
-static inline void zswap_entry_get(struct zswap_entry *entry)
-{
-	entry->refcount++;
-}
-
-static inline int zswap_entry_put(struct zswap_entry *entry)
-{
-	entry->refcount--;
-	return entry->refcount;
-}
-
 /*********************************
 * rbtree functions
 **********************************/
@@ -281,9 +258,9 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
 }
 
 /*
- * In the case that a entry with the same offset is found, it a pointer to
+ * In the case that a entry with the same offset is found, a pointer to
  * the existing entry is stored in dupentry and the function returns -EEXIST
-*/
+ */
 static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
 			struct zswap_entry **dupentry)
 {
@@ -307,6 +284,60 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
 	return 0;
 }
 
+static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
+{
+	if (!RB_EMPTY_NODE(&entry->rbnode)) {
+		rb_erase(&entry->rbnode, root);
+		RB_CLEAR_NODE(&entry->rbnode);
+	}
+}
+
+/*
+ * Carries out the common pattern of freeing and entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_entry *entry)
+{
+	zpool_free(zswap_pool, entry->handle);
+	zswap_entry_cache_free(entry);
+	atomic_dec(&zswap_stored_pages);
+	zswap_pool_total_size = zpool_get_total_size(zswap_pool);
+}
+
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+	entry->refcount++;
+}
+
+/* caller must hold the tree lock
+* remove from the tree and free it, if nobody reference the entry
+*/
+static void zswap_entry_put(struct zswap_tree *tree,
+			struct zswap_entry *entry)
+{
+	int refcount = --entry->refcount;
+
+	BUG_ON(refcount < 0);
+	if (refcount == 0) {
+		zswap_rb_erase(&tree->rbroot, entry);
+		zswap_free_entry(entry);
+	}
+}
+
+/* caller must hold the tree lock */
+static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
+				pgoff_t offset)
+{
+	struct zswap_entry *entry = NULL;
+
+	entry = zswap_rb_search(root, offset);
+	if (entry)
+		zswap_entry_get(entry);
+
+	return entry;
+}
+
 /*********************************
 * per-cpu code
 **********************************/
@@ -325,7 +356,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
 			return NOTIFY_BAD;
 		}
 		*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
-		dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
+		dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
 		if (!dst) {
 			pr_err("can't allocate compressor buffer\n");
 			crypto_free_comp(tfm);
@@ -342,10 +373,8 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
 			*per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
 		}
 		dst = per_cpu(zswap_dstmem, cpu);
-		if (dst) {
-			kfree(dst);
-			per_cpu(zswap_dstmem, cpu) = NULL;
-		}
+		kfree(dst);
+		per_cpu(zswap_dstmem, cpu) = NULL;
 		break;
 	default:
 		break;
@@ -364,108 +393,42 @@ static struct notifier_block zswap_cpu_notifier_block = {
 	.notifier_call = zswap_cpu_notifier
 };
 
-static int zswap_cpu_init(void)
+static int __init zswap_cpu_init(void)
 {
 	unsigned long cpu;
 
-	get_online_cpus();
+	cpu_notifier_register_begin();
 	for_each_online_cpu(cpu)
 		if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
 			goto cleanup;
-	register_cpu_notifier(&zswap_cpu_notifier_block);
-	put_online_cpus();
+	__register_cpu_notifier(&zswap_cpu_notifier_block);
+	cpu_notifier_register_done();
 	return 0;
 
 cleanup:
 	for_each_online_cpu(cpu)
 		__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
-	put_online_cpus();
+	cpu_notifier_register_done();
 	return -ENOMEM;
 }
 
-/*********************************
-* zsmalloc callbacks
-**********************************/
-static mempool_t *zswap_page_pool;
-
-static inline unsigned int zswap_max_pool_pages(void)
-{
-	return zswap_max_pool_percent * totalram_pages / 100;
-}
-
-static inline int zswap_page_pool_create(void)
-{
-	/* TODO: dynamically size mempool */
-	zswap_page_pool = mempool_create_page_pool(256, 0);
-	if (!zswap_page_pool)
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void zswap_page_pool_destroy(void)
-{
-	mempool_destroy(zswap_page_pool);
-}
-
-static struct page *zswap_alloc_page(gfp_t flags)
-{
-	struct page *page;
-
-	if (atomic_read(&zswap_pool_pages) >= zswap_max_pool_pages()) {
-		zswap_pool_limit_hit++;
-		return NULL;
-	}
-	page = mempool_alloc(zswap_page_pool, flags);
-	if (page)
-		atomic_inc(&zswap_pool_pages);
-	return page;
-}
-
-static void zswap_free_page(struct page *page)
-{
-	if (!page)
-		return;
-	mempool_free(page, zswap_page_pool);
-	atomic_dec(&zswap_pool_pages);
-}
-
-static struct zs_ops zswap_zs_ops = {
-	.alloc = zswap_alloc_page,
-	.free = zswap_free_page
-};
-
-
 /*********************************
 * helpers
 **********************************/
-
-/*
- * Carries out the common pattern of freeing and entry's zsmalloc allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
+static bool zswap_is_full(void)
 {
-	zs_free(tree->pool, entry->handle);
-	zswap_entry_cache_free(entry);
-	atomic_dec(&zswap_stored_pages);
+	return totalram_pages * zswap_max_pool_percent / 100 <
+		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 }
 
-#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK
 /*********************************
 * writeback code
 **********************************/
-static void zswap_end_swap_write(struct bio *bio, int err)
-{
-	end_swap_bio_write(bio, err);
-	atomic_dec(&zswap_outstanding_writebacks);
-	zswap_written_back_pages++;
-}
-
 /* return enum for zswap_get_swap_cache_page */
 enum zswap_get_swap_ret {
 	ZSWAP_SWAPCACHE_NEW,
 	ZSWAP_SWAPCACHE_EXIST,
-	ZSWAP_SWAPCACHE_NOMEM
+	ZSWAP_SWAPCACHE_FAIL,
 };
 
 /*
@@ -479,15 +442,16 @@ enum zswap_get_swap_ret {
  * added to the swap cache, and returned in retpage.
  *
  * If success, the swap cache page is returned in retpage
- * Returns 0 if page was already in the swap cache, page is not locked
- * Returns 1 if the new page needs to be populated, page is locked
- * Returns <0 on error
+ * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
+ * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
+ *     the new page is added to swapcache and locked
+ * Returns ZSWAP_SWAPCACHE_FAIL on error
  */
 static int zswap_get_swap_cache_page(swp_entry_t entry,
 				struct page **retpage)
 {
 	struct page *found_page, *new_page = NULL;
-	struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
+	struct address_space *swapper_space = swap_address_space(entry);
 	int err;
 
 	*retpage = NULL;
@@ -553,13 +517,13 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 	if (new_page)
 		page_cache_release(new_page);
 	if (!found_page)
-		return ZSWAP_SWAPCACHE_NOMEM;
+		return ZSWAP_SWAPCACHE_FAIL;
 	*retpage = found_page;
 	return ZSWAP_SWAPCACHE_EXIST;
 }
 
 /*
- * Attempts to free and entry by adding a page to the swap cache,
+ * Attempts to free an entry by adding a page to the swap cache,
  * decompressing the entry data into the page, and issuing a
  * bio write to write the page back to the swap device.
  *
@@ -570,12 +534,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
  * the swap cache, the compressed version stored by zswap can be
  * freed.
  */
-static int zswap_writeback_entry(struct zswap_tree *tree,
-				struct zswap_entry *entry)
+static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 {
-	unsigned long type = tree->type;
-	struct page *page;
+	struct zswap_header *zhdr;
 	swp_entry_t swpentry;
+	struct zswap_tree *tree;
+	pgoff_t offset;
+	struct zswap_entry *entry;
+	struct page *page;
 	u8 *src, *dst;
 	unsigned int dlen;
 	int ret;
@@ -583,30 +549,46 @@ static int zswap_writeback_entry(struct zswap_tree *tree,
 		.sync_mode = WB_SYNC_NONE,
 	};
 
-	/* get/allocate page in the swap cache */
-	swpentry = swp_entry(type, entry->offset);
+	/* extract swpentry from data */
+	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
+	swpentry = zhdr->swpentry; /* here */
+	zpool_unmap_handle(pool, handle);
+	tree = zswap_trees[swp_type(swpentry)];
+	offset = swp_offset(swpentry);
+
+	/* find and ref zswap entry */
+	spin_lock(&tree->lock);
+	entry = zswap_entry_find_get(&tree->rbroot, offset);
+	if (!entry) {
+		/* entry was invalidated */
+		spin_unlock(&tree->lock);
+		return 0;
+	}
+	spin_unlock(&tree->lock);
+	BUG_ON(offset != entry->offset);
 
 	/* try to allocate swap cache page */
 	switch (zswap_get_swap_cache_page(swpentry, &page)) {
+	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+		ret = -ENOMEM;
+		goto fail;
 
-	case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
-		return -ENOMEM;
-		break; /* not reached */
-
-	case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
+	case ZSWAP_SWAPCACHE_EXIST:
 		/* page is already in the swap cache, ignore for now */
-		return -EEXIST;
-		break; /* not reached */
+		page_cache_release(page);
+		ret = -EEXIST;
+		goto fail;
 
 	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 		/* decompress */
 		dlen = PAGE_SIZE;
-		src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO);
+		src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+				ZPOOL_MM_RO) + sizeof(struct zswap_header);
 		dst = kmap_atomic(page);
-		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
-				dst, &dlen);
+		ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
+				entry->length, dst, &dlen);
 		kunmap_atomic(dst);
-		zs_unmap_object(tree->pool, entry->handle);
+		zpool_unmap_handle(zswap_pool, entry->handle);
 		BUG_ON(ret);
 		BUG_ON(dlen != PAGE_SIZE);
 
@@ -614,148 +596,45 @@ static int zswap_writeback_entry(struct zswap_tree *tree,
 		SetPageUptodate(page);
 	}
 
-	/* start writeback */
+	/* move it to the tail of the inactive list after end_writeback */
 	SetPageReclaim(page);
-	if (!__swap_writepage(page, &wbc, zswap_end_swap_write))
-		atomic_inc(&zswap_outstanding_writebacks);
-	page_cache_release(page);
-
-	return 0;
-}
-
-/*
- * Attempts to free nr of entries via writeback to the swap device.
- * The number of entries that were actually freed is returned.
- */
-static int zswap_writeback_entries(struct zswap_tree *tree, int nr)
-{
-	struct zswap_entry *entry;
-	int i, ret, refcount, freed_nr = 0;
-
-	for (i = 0; i < nr; i++) {
-		/*
-		 * This limits is arbitrary for now until a better
-		 * policy can be implemented. This is so we don't
-		 * eat all of RAM decompressing pages for writeback.
-		 */
-		if (atomic_read(&zswap_outstanding_writebacks) >
-				ZSWAP_MAX_OUTSTANDING_FLUSHES)
-			break;
-
-		spin_lock(&tree->lock);
-
-		/* dequeue from lru */
-		if (list_empty(&tree->lru)) {
-			spin_unlock(&tree->lock);
-			break;
-		}
-		entry = list_first_entry(&tree->lru,
-				struct zswap_entry, lru);
-		list_del_init(&entry->lru);
-
-		/* so invalidate doesn't free the entry from under us */
-		zswap_entry_get(entry);
-
-		spin_unlock(&tree->lock);
 
-		/* attempt writeback */
-		ret = zswap_writeback_entry(tree, entry);
-
-		spin_lock(&tree->lock);
-
-		/* drop reference from above */
-		refcount = zswap_entry_put(entry);
-
-		if (!ret)
-			/* drop the initial reference from entry creation */
-			refcount = zswap_entry_put(entry);
-
-		/*
-		 * There are four possible values for refcount here:
-		 * (1) refcount is 2, writeback failed and load is in progress;
-		 *     do nothing, load will add us back to the LRU
-		 * (2) refcount is 1, writeback failed; do not free entry,
-		 *     add back to LRU
-		 * (3) refcount is 0, (normal case) not invalidate yet;
-		 *     remove from rbtree and free entry
-		 * (4) refcount is -1, invalidate happened during writeback;
-		 *     free entry
-		 */
-		if (refcount == 1)
-			list_add(&entry->lru, &tree->lru);
-
-		if (refcount == 0) {
-			/* no invalidate yet, remove from rbtree */
-			rb_erase(&entry->rbnode, &tree->rbroot);
-		}
-		spin_unlock(&tree->lock);
-		if (refcount <= 0) {
-			/* free the entry */
-			zswap_free_entry(tree, entry);
-			freed_nr++;
-		}
-	}
-	return freed_nr;
-}
-#endif /* CONFIG_ZSWAP_ENABLE_WRITEBACK */
-
-/*******************************************
-* page pool for temporary compression result
-********************************************/
-#define ZSWAP_TMPPAGE_POOL_PAGES 16
-static LIST_HEAD(zswap_tmppage_list);
-static DEFINE_SPINLOCK(zswap_tmppage_lock);
-
-static void zswap_tmppage_pool_destroy(void)
-{
-	struct page *page, *tmppage;
+	/* start writeback */
+	__swap_writepage(page, &wbc, end_swap_bio_write);
+	page_cache_release(page);
+	zswap_written_back_pages++;
 
-	spin_lock(&zswap_tmppage_lock);
-	list_for_each_entry_safe(page, tmppage, &zswap_tmppage_list, lru) {
-		list_del(&page->lru);
-		__free_pages(page, 1);
-	}
-	spin_unlock(&zswap_tmppage_lock);
-}
+	spin_lock(&tree->lock);
+	/* drop local reference */
+	zswap_entry_put(tree, entry);
 
-static int zswap_tmppage_pool_create(void)
-{
-	int i;
-	struct page *page;
+	/*
+	* There are two possible situations for entry here:
+	* (1) refcount is 1(normal case),  entry is valid and on the tree
+	* (2) refcount is 0, entry is freed and not on the tree
+	*     because invalidate happened during writeback
+	*  search the tree and free the entry if find entry
+	*/
+	if (entry == zswap_rb_search(&tree->rbroot, offset))
+		zswap_entry_put(tree, entry);
+	spin_unlock(&tree->lock);
 
-	for (i = 0; i < ZSWAP_TMPPAGE_POOL_PAGES; i++) {
-		page = alloc_pages(GFP_KERNEL, 1);
-		if (!page) {
-			zswap_tmppage_pool_destroy();
-			return -ENOMEM;
-		}
-		spin_lock(&zswap_tmppage_lock);
-		list_add(&page->lru, &zswap_tmppage_list);
-		spin_unlock(&zswap_tmppage_lock);
-	}
-	return 0;
-}
+	goto end;
 
-static inline struct page *zswap_tmppage_alloc(void)
-{
-	struct page *page;
-
-	spin_lock(&zswap_tmppage_lock);
-	if (list_empty(&zswap_tmppage_list)) {
-		spin_unlock(&zswap_tmppage_lock);
-		return NULL;
-	}
-	page = list_first_entry(&zswap_tmppage_list, struct page, lru);
-	list_del(&page->lru);
-	spin_unlock(&zswap_tmppage_lock);
-	return page;
-}
+	/*
+	* if we get here due to ZSWAP_SWAPCACHE_EXIST
+	* a load may happening concurrently
+	* it is safe and okay to not free the entry
+	* if we free the entry in the following put
+	* it it either okay to return !0
+	*/
+fail:
+	spin_lock(&tree->lock);
+	zswap_entry_put(tree, entry);
+	spin_unlock(&tree->lock);
 
-static inline void zswap_tmppage_free(struct page *page)
-{
-	spin_lock(&zswap_tmppage_lock);
-	list_add(&page->lru, &zswap_tmppage_list);
-	spin_unlock(&zswap_tmppage_lock);
+end:
+	return ret;
 }
 
 /*********************************
@@ -768,25 +647,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry, *dupentry;
 	int ret;
-	unsigned int dlen = PAGE_SIZE;
+	unsigned int dlen = PAGE_SIZE, len;
 	unsigned long handle;
 	char *buf;
 	u8 *src, *dst;
-	struct page *tmppage;
-	bool writeback_attempted = 0;
-#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK
-	u8 *tmpdst;
-#endif
+	struct zswap_header *zhdr;
 
-	if (!tree) {
+	if (!zswap_enabled || !tree) {
 		ret = -ENODEV;
 		goto reject;
 	}
 
-	/* if this page got EIO on pageout before, give up immediately */
-	if (PageError(page)) {
-		ret = -ENOMEM;
-		goto reject;
+	/* reclaim space if needed */
+	if (zswap_is_full()) {
+		zswap_pool_limit_hit++;
+		if (zpool_shrink(zswap_pool, 1, NULL)) {
+			zswap_reject_reclaim_fail++;
+			ret = -ENOMEM;
+			goto reject;
+		}
 	}
 
 	/* allocate entry */
@@ -806,62 +685,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		ret = -EINVAL;
 		goto freepage;
 	}
-	if ((dlen * 100 / PAGE_SIZE) > zswap_max_compression_ratio) {
+
+	/* store */
+	len = dlen + sizeof(struct zswap_header);
+	ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
+		&handle);
+	if (ret == -ENOSPC) {
 		zswap_reject_compress_poor++;
-		ret = -E2BIG;
 		goto freepage;
 	}
-
-	/* store */
-	handle = zs_malloc(tree->pool, dlen,
-		__GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC |
-			__GFP_NOWARN);
-	if (!handle) {
-#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK
-		zswap_writeback_attempted++;
-		/*
-		 * Copy compressed buffer out of per-cpu storage so
-		 * we can re-enable preemption.
-		*/
-		tmppage = zswap_tmppage_alloc();
-		if (!tmppage) {
-			zswap_reject_tmppage_fail++;
-			ret = -ENOMEM;
-			goto freepage;
-		}
-		writeback_attempted = 1;
-		tmpdst = page_address(tmppage);
-		memcpy(tmpdst, dst, dlen);
-		dst = tmpdst;
-		put_cpu_var(zswap_dstmem);
-
-		/* try to free up some space */
-		/* TODO: replace with more targeted policy */
-		zswap_writeback_entries(tree, 16);
-		/* try again, allowing wait */
-		handle = zs_malloc(tree->pool, dlen,
-			__GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC |
-				__GFP_NOWARN);
-		if (!handle) {
-			/* still no space, fail */
-			zswap_reject_zsmalloc_fail++;
-			ret = -ENOMEM;
-			goto freepage;
-		}
-		zswap_saved_by_writeback++;
-#else
-		ret = -ENOMEM;
+	if (ret) {
+		zswap_reject_alloc_fail++;
 		goto freepage;
-#endif
 	}
-
-	buf = zs_map_object(tree->pool, handle, ZS_MM_WO);
+	zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
+	zhdr->swpentry = swp_entry(type, offset);
+	buf = (u8 *)(zhdr + 1);
 	memcpy(buf, dst, dlen);
-	zs_unmap_object(tree->pool, handle);
-	if (writeback_attempted)
-		zswap_tmppage_free(tmppage);
-	else
-		put_cpu_var(zswap_dstmem);
+	zpool_unmap_handle(zswap_pool, handle);
+	put_cpu_var(zswap_dstmem);
 
 	/* populate entry */
 	entry->offset = offset;
@@ -874,29 +716,21 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
 		if (ret == -EEXIST) {
 			zswap_duplicate_entry++;
-			/* remove from rbtree and lru */
-			rb_erase(&dupentry->rbnode, &tree->rbroot);
-			if (!list_empty(&dupentry->lru))
-				list_del_init(&dupentry->lru);
-			if (!zswap_entry_put(dupentry)) {
-				/* free */
-				zswap_free_entry(tree, dupentry);
-			}
+			/* remove from rbtree */
+			zswap_rb_erase(&tree->rbroot, dupentry);
+			zswap_entry_put(tree, dupentry);
 		}
 	} while (ret == -EEXIST);
-	list_add_tail(&entry->lru, &tree->lru);
 	spin_unlock(&tree->lock);
 
 	/* update stats */
 	atomic_inc(&zswap_stored_pages);
+	zswap_pool_total_size = zpool_get_total_size(zswap_pool);
 
 	return 0;
 
 freepage:
-	if (writeback_attempted)
-		zswap_tmppage_free(tmppage);
-	else
-		put_cpu_var(zswap_dstmem);
+	put_cpu_var(zswap_dstmem);
 	zswap_entry_cache_free(entry);
 reject:
 	return ret;
@@ -913,59 +747,41 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 	struct zswap_entry *entry;
 	u8 *src, *dst;
 	unsigned int dlen;
-	int refcount;
+	int ret;
 
 	/* find */
 	spin_lock(&tree->lock);
-	entry = zswap_rb_search(&tree->rbroot, offset);
+	entry = zswap_entry_find_get(&tree->rbroot, offset);
 	if (!entry) {
 		/* entry was written back */
 		spin_unlock(&tree->lock);
 		return -1;
 	}
-	zswap_entry_get(entry);
-
-	/* remove from lru */
-	if (!list_empty(&entry->lru))
-		list_del_init(&entry->lru);
 	spin_unlock(&tree->lock);
 
 	/* decompress */
 	dlen = PAGE_SIZE;
-	src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO);
+	src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
+			ZPOOL_MM_RO) + sizeof(struct zswap_header);
 	dst = kmap_atomic(page);
-	zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
+	ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
 		dst, &dlen);
 	kunmap_atomic(dst);
-	zs_unmap_object(tree->pool, entry->handle);
+	zpool_unmap_handle(zswap_pool, entry->handle);
+	BUG_ON(ret);
 
 	spin_lock(&tree->lock);
-	refcount = zswap_entry_put(entry);
-	if (likely(refcount)) {
-		list_add_tail(&entry->lru, &tree->lru);
-		spin_unlock(&tree->lock);
-		return 0;
-	}
+	zswap_entry_put(tree, entry);
 	spin_unlock(&tree->lock);
 
-	/*
-	 * We don't have to unlink from the rbtree because
-	 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
-	 * has already done this for us if we are the last reference.
-	 */
-	/* free */
-
-	zswap_free_entry(tree, entry);
-
 	return 0;
 }
 
-/* invalidates a single page */
+/* frees an entry in zswap */
 static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 {
 	struct zswap_tree *tree = zswap_trees[type];
 	struct zswap_entry *entry;
-	int refcount;
 
 	/* find */
 	spin_lock(&tree->lock);
@@ -976,80 +792,51 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 		return;
 	}
 
-	/* remove from rbtree and lru */
-	rb_erase(&entry->rbnode, &tree->rbroot);
-	if (!list_empty(&entry->lru))
-		list_del_init(&entry->lru);
+	/* remove from rbtree */
+	zswap_rb_erase(&tree->rbroot, entry);
 
 	/* drop the initial reference from entry creation */
-	refcount = zswap_entry_put(entry);
+	zswap_entry_put(tree, entry);
 
 	spin_unlock(&tree->lock);
-
-	if (refcount) {
-		/* writeback in progress, writeback will free */
-		return;
-	}
-
-	/* free */
-	zswap_free_entry(tree, entry);
 }
 
-/* invalidates all pages for the given swap type */
+/* frees all zswap entries for the given swap type */
 static void zswap_frontswap_invalidate_area(unsigned type)
 {
 	struct zswap_tree *tree = zswap_trees[type];
-	struct rb_node *node;
-	struct zswap_entry *entry;
+	struct zswap_entry *entry, *n;
 
 	if (!tree)
 		return;
 
 	/* walk the tree and free everything */
 	spin_lock(&tree->lock);
-	/*
-	 * TODO: Even though this code should not be executed because
-	 * the try_to_unuse() in swapoff should have emptied the tree,
-	 * it is very wasteful to rebalance the tree after every
-	 * removal when we are freeing the whole tree.
-	 *
-	 * If post-order traversal code is ever added to the rbtree
-	 * implementation, it should be used here.
-	 */
-	while ((node = rb_first(&tree->rbroot))) {
-		entry = rb_entry(node, struct zswap_entry, rbnode);
-		rb_erase(&entry->rbnode, &tree->rbroot);
-		zs_free(tree->pool, entry->handle);
-		zswap_entry_cache_free(entry);
-		atomic_dec(&zswap_stored_pages);
-	}
+	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
+		zswap_free_entry(entry);
 	tree->rbroot = RB_ROOT;
-	INIT_LIST_HEAD(&tree->lru);
 	spin_unlock(&tree->lock);
+	kfree(tree);
+	zswap_trees[type] = NULL;
 }
 
-/* NOTE: this is called in atomic context from swapon and must not sleep */
+static const struct zpool_ops zswap_zpool_ops = {
+	.evict = zswap_writeback_entry
+};
+
 static void zswap_frontswap_init(unsigned type)
 {
 	struct zswap_tree *tree;
 
-	tree = kzalloc(sizeof(struct zswap_tree), GFP_ATOMIC);
-	if (!tree)
-		goto err;
-	tree->pool = zs_create_pool(GFP_NOWAIT, &zswap_zs_ops);
-	if (!tree->pool)
-		goto freetree;
+	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
+	if (!tree) {
+		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+		return;
+	}
+
 	tree->rbroot = RB_ROOT;
-	INIT_LIST_HEAD(&tree->lru);
 	spin_lock_init(&tree->lock);
-	tree->type = type;
 	zswap_trees[type] = tree;
-	return;
-
-freetree:
-	kfree(tree);
-err:
-	pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 }
 
 static struct frontswap_ops zswap_frontswap_ops = {
@@ -1077,16 +864,12 @@ static int __init zswap_debugfs_init(void)
 	if (!zswap_debugfs_root)
 		return -ENOMEM;
 
-	debugfs_create_u64("saved_by_writeback", S_IRUGO,
-			zswap_debugfs_root, &zswap_saved_by_writeback);
 	debugfs_create_u64("pool_limit_hit", S_IRUGO,
 			zswap_debugfs_root, &zswap_pool_limit_hit);
-	debugfs_create_u64("reject_writeback_attempted", S_IRUGO,
-			zswap_debugfs_root, &zswap_writeback_attempted);
-	debugfs_create_u64("reject_tmppage_fail", S_IRUGO,
-			zswap_debugfs_root, &zswap_reject_tmppage_fail);
-	debugfs_create_u64("reject_zsmalloc_fail", S_IRUGO,
-			zswap_debugfs_root, &zswap_reject_zsmalloc_fail);
+	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_reclaim_fail);
+	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
+			zswap_debugfs_root, &zswap_reject_alloc_fail);
 	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
 			zswap_debugfs_root, &zswap_reject_kmemcache_fail);
 	debugfs_create_u64("reject_compress_poor", S_IRUGO,
@@ -1095,14 +878,11 @@ static int __init zswap_debugfs_init(void)
 			zswap_debugfs_root, &zswap_written_back_pages);
 	debugfs_create_u64("duplicate_entry", S_IRUGO,
 			zswap_debugfs_root, &zswap_duplicate_entry);
-	debugfs_create_atomic_t("pool_pages", S_IRUGO,
-			zswap_debugfs_root, &zswap_pool_pages);
+	debugfs_create_u64("pool_total_size", S_IRUGO,
+			zswap_debugfs_root, &zswap_pool_total_size);
 	debugfs_create_atomic_t("stored_pages", S_IRUGO,
 			zswap_debugfs_root, &zswap_stored_pages);
-#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK
-	debugfs_create_atomic_t("outstanding_writebacks", S_IRUGO,
-			zswap_debugfs_root, &zswap_outstanding_writebacks);
-#endif
+
 	return 0;
 }
 
@@ -1111,12 +891,12 @@ static void __exit zswap_debugfs_exit(void)
 	debugfs_remove_recursive(zswap_debugfs_root);
 }
 #else
-static inline int __init zswap_debugfs_init(void)
+static int __init zswap_debugfs_init(void)
 {
 	return 0;
 }
 
-static inline void __exit zswap_debugfs_exit(void) { }
+static void __exit zswap_debugfs_exit(void) { }
 #endif
 
 /*********************************
@@ -1124,21 +904,28 @@ static inline void __exit zswap_debugfs_exit(void) { }
 **********************************/
 static int __init init_zswap(void)
 {
-	if (!zswap_enabled)
-		return 0;
+	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
 
 	pr_info("loading zswap\n");
-	if (zswap_entry_cache_create()) {
-		pr_err("entry cache creation failed\n");
-		goto error;
+
+	zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+					&zswap_zpool_ops);
+	if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+		pr_info("%s zpool not available\n", zswap_zpool_type);
+		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
+		zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+					&zswap_zpool_ops);
 	}
-	if (zswap_page_pool_create()) {
-		pr_err("page pool initialization failed\n");
-		goto pagepoolfail;
+	if (!zswap_pool) {
+		pr_err("%s zpool not available\n", zswap_zpool_type);
+		pr_err("zpool creation failed\n");
+		goto error;
 	}
-	if (zswap_tmppage_pool_create()) {
-		pr_err("workmem pool initialization failed\n");
-		goto tmppoolfail;
+	pr_info("using %s pool\n", zswap_zpool_type);
+
+	if (zswap_entry_cache_create()) {
+		pr_err("entry cache creation failed\n");
+		goto cachefail;
 	}
 	if (zswap_comp_init()) {
 		pr_err("compressor initialization failed\n");
@@ -1148,6 +935,7 @@ static int __init init_zswap(void)
 		pr_err("per-cpu initialization failed\n");
 		goto pcpufail;
 	}
+
 	frontswap_register_ops(&zswap_frontswap_ops);
 	if (zswap_debugfs_init())
 		pr_warn("debugfs initialization failed\n");
@@ -1155,11 +943,9 @@ static int __init init_zswap(void)
 pcpufail:
 	zswap_comp_exit();
 compfail:
-	zswap_tmppage_pool_destroy();
-tmppoolfail:
-	zswap_page_pool_destroy();
-pagepoolfail:
-	zswap_entry_cache_destory();
+	zswap_entry_cache_destroy();
+cachefail:
+	zpool_destroy_pool(zswap_pool);
 error:
 	return -ENOMEM;
 }
@@ -1167,5 +953,5 @@ static int __init init_zswap(void)
 late_initcall(init_zswap);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
-MODULE_DESCRIPTION("Compressed cache for swap pages");
+MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
+MODULE_DESCRIPTION("Compressed cache for swap pages");
\ No newline at end of file
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 8755a3079..7fc10b915 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -137,20 +137,24 @@ static int __xfrm6_output(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
 	int mtu = ip6_skb_dst_mtu(skb);
+	bool toobig;
 
-	if (skb->len > mtu && xfrm6_local_dontfrag(skb)) {
+	if (x->props.mode != XFRM_MODE_TUNNEL)
+		goto skip_frag;
+
+	toobig = skb->len > mtu && !skb_is_gso(skb);
+
+	if (toobig && xfrm6_local_dontfrag(skb)) {
 		xfrm6_local_rxpmtu(skb, mtu);
 		return -EMSGSIZE;
-	} else if (!skb->local_df && skb->len > mtu && skb->sk) {
+	} else if (!skb->local_df && toobig && skb->sk) {
 		xfrm6_local_error(skb, mtu);
 		return -EMSGSIZE;
 	}
 
-	if (x->props.mode == XFRM_MODE_TUNNEL &&
-	    ((skb->len > mtu && !skb_is_gso(skb)) ||
-		dst_allfrag(skb_dst(skb)))) {
+	if (toobig || dst_allfrag(skb_dst(skb)))
 			return ip6_fragment(skb, x->outer_mode->afinfo->output_finish);
-	}
+skip_frag:
 	return x->outer_mode->afinfo->output_finish(skb);
 }
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index bed52a6ba..d7bfc431d 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -284,9 +284,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx)
 	if (tx->sdata->vif.type == NL80211_IFTYPE_WDS)
 		return TX_CONTINUE;
 
-	if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
-		return TX_CONTINUE;
-
 	if (tx->flags & IEEE80211_TX_PS_BUFFERED)
 		return TX_CONTINUE;
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 42eb7ba0b..897a5f14c 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -545,6 +545,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
 {
 	struct ib_send_wr send_wr;
 	struct ib_send_wr inv_wr;
+	u32 xdr_off;
 	int sge_no;
 	int sge_bytes;
 	int page_no;
@@ -584,8 +585,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	ctxt->direction = DMA_TO_DEVICE;
 
 	/* Map the payload indicated by 'byte_count' */
+	xdr_off = 0;
 	for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
-		int xdr_off = 0;
 		sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
 		byte_count -= sge_bytes;
 		if (!vec->frmr) {
@@ -623,6 +624,14 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		if (page_no+1 >= sge_no)
 			ctxt->sge[page_no+1].length = 0;
 	}
+
+	/* The loop above bumps sc_dma_used for each sge. The
+	 * xdr_buf.tail gets a separate sge, but resides in the
+	 * same page as xdr_buf.head. Don't count it twice.
+	 */
+	if (sge_no > ctxt->count)
+		atomic_dec(&rdma->sc_dma_used);
+
 	BUG_ON(sge_no > rdma->sc_max_sge);
 	memset(&send_wr, 0, sizeof send_wr);
 	ctxt->wr_op = IB_WR_SEND;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 31275e52c..d4a564fec 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -811,6 +811,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 {
 	struct socket *sock = transport->sock;
 	struct sock *sk = transport->inet;
+	struct rpc_xprt *xprt = &transport->xprt;
 
 	if (sk == NULL)
 		return;
@@ -824,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
 	sk->sk_user_data = NULL;
 
 	xs_restore_old_callbacks(transport, sk);
+	xprt_clear_connected(xprt);
 	write_unlock_bh(&sk->sk_callback_lock);
 
 	sk->sk_no_check = 0;
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 70d93c605..5b3796788 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1026,7 +1026,7 @@ static void selinux_write_opts(struct seq_file *m,
 		seq_puts(m, prefix);
 		if (has_comma)
 			seq_putc(m, '\"');
-		seq_puts(m, opts->mnt_opts[i]);
+		seq_escape(m, opts->mnt_opts[i], "\"\n\\");
 		if (has_comma)
 			seq_putc(m, '\"');
 	}
diff --git a/setup_ramdisk.sh b/setup_ramdisk.sh
index 0a70b4aa2..9aa0ec904 100755
--- a/setup_ramdisk.sh
+++ b/setup_ramdisk.sh
@@ -21,7 +21,7 @@ RDIR=$(pwd)
 # japanese variants:
 #	dcm = N900D / SC-01F  (NTT Docomo)
 #	kdi = N900J / SCL22   (au by KDDI)
-VARIANT=can
+VARIANT=eur
 
 ############## SCARY NO-TOUCHY STUFF ###############
 
diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig
index 885683a3b..e04062117 100644
--- a/sound/arm/Kconfig
+++ b/sound/arm/Kconfig
@@ -9,6 +9,14 @@ menuconfig SND_ARM
 	  Drivers that are implemented on ASoC can be found in
 	  "ALSA for SoC audio support" section.
 
+config SND_PXA2XX_LIB
+	tristate
+	select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97
+	select SND_DMAENGINE_PCM
+
+config SND_PXA2XX_LIB_AC97
+	bool
+
 if SND_ARM
 
 config SND_ARMAACI
@@ -21,13 +29,6 @@ config SND_PXA2XX_PCM
 	tristate
 	select SND_PCM
 
-config SND_PXA2XX_LIB
-	tristate
-	select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97
-
-config SND_PXA2XX_LIB_AC97
-	bool
-
 config SND_PXA2XX_AC97
 	tristate "AC97 driver for the Intel PXA2xx chip"
 	depends on ARCH_PXA
diff --git a/sound/soc/pxa/Kconfig b/sound/soc/pxa/Kconfig
index a0f7d3cfa..23deb67b8 100644
--- a/sound/soc/pxa/Kconfig
+++ b/sound/soc/pxa/Kconfig
@@ -1,7 +1,6 @@
 config SND_PXA2XX_SOC
 	tristate "SoC Audio for the Intel PXA2xx chip"
 	depends on ARCH_PXA
-	select SND_ARM
 	select SND_PXA2XX_LIB
 	help
 	  Say Y or M if you want to add support for codecs attached to
@@ -15,7 +14,6 @@ config SND_PXA2XX_AC97
 config SND_PXA2XX_SOC_AC97
 	tristate
 	select AC97_BUS
-	select SND_ARM
 	select SND_PXA2XX_LIB_AC97
 	select SND_SOC_AC97_BUS
 
diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c
index d8a22c07e..5e612f881 100644
--- a/sound/soc/soc-dapm.c
+++ b/sound/soc/soc-dapm.c
@@ -1620,7 +1620,7 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event)
 	struct snd_soc_dapm_context *d;
 	LIST_HEAD(up_list);
 	LIST_HEAD(down_list);
-	LIST_HEAD(async_domain);
+	ASYNC_DOMAIN_EXCLUSIVE(async_domain);
 	enum snd_soc_bias_level bias;
 
 	trace_snd_soc_dapm_start(card);
diff --git a/sound/synth/emux/emux_oss.c b/sound/synth/emux/emux_oss.c
index daf61abc3..646b66703 100644
--- a/sound/synth/emux/emux_oss.c
+++ b/sound/synth/emux/emux_oss.c
@@ -69,7 +69,8 @@ snd_emux_init_seq_oss(struct snd_emux *emu)
 	struct snd_seq_oss_reg *arg;
 	struct snd_seq_device *dev;
 
-	if (snd_seq_device_new(emu->card, 0, SNDRV_SEQ_DEV_ID_OSS,
+	/* using device#1 here for avoiding conflicts with OPL3 */
+	if (snd_seq_device_new(emu->card, 1, SNDRV_SEQ_DEV_ID_OSS,
 			       sizeof(struct snd_seq_oss_reg), &dev) < 0)
 		return;
 
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index c0b70c697..5a4482c2a 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -1060,25 +1060,19 @@ static void print_cpudesc(struct perf_header *ph, int fd, FILE *fp)
 static void print_nrcpus(struct perf_header *ph, int fd, FILE *fp)
 {
 	ssize_t ret;
-	u32 nr;
+	u32 nr[2];
 
 	ret = read(fd, &nr, sizeof(nr));
 	if (ret != (ssize_t)sizeof(nr))
-		nr = -1; /* interpreted as error */
+		nr[0] = nr[1] = -1; /* interpreted as error */
 
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
-
-	fprintf(fp, "# nrcpus online : %u\n", nr);
-
-	ret = read(fd, &nr, sizeof(nr));
-	if (ret != (ssize_t)sizeof(nr))
-		nr = -1; /* interpreted as error */
-
-	if (ph->needs_swap)
-		nr = bswap_32(nr);
+	if (ph->needs_swap) {
+		nr[0] = bswap_32(nr[0]);
+		nr[1] = bswap_32(nr[1]);
+	}
 
-	fprintf(fp, "# nrcpus avail : %u\n", nr);
+	fprintf(fp, "# nrcpus online : %u\n", nr[1]);
+	fprintf(fp, "# nrcpus avail : %u\n", nr[0]);
 }
 
 static void print_version(struct perf_header *ph, int fd, FILE *fp)