diff --git a/Makefile b/Makefile index c4610edd8..cd79b4837 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 4 -SUBLEVEL = 111 +SUBLEVEL = 112 EXTRAVERSION = NAME = Saber-toothed Squirrel @@ -243,8 +243,8 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) -HOSTCC = gcc -HOSTCXX = g++ +HOSTCC = ccache gcc +HOSTCXX = ccache g++ HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 @@ -330,7 +330,7 @@ include $(srctree)/scripts/Kbuild.include AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld -REAL_CC = $(CROSS_COMPILE)gcc +REAL_CC = ccache $(CROSS_COMPILE)gcc CPP = $(CC) -E AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm @@ -377,6 +377,8 @@ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ -Werror-implicit-function-declaration \ -Wno-format-security \ + -Wno-discarded-array-qualifiers \ + -Wno-switch-bool \ -mtune=cortex-a15 \ --param l1-cache-size=32 \ --param l2-cache-size=2048 \ diff --git a/VERSION b/VERSION index a3fcc7121..0ee843cc6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7.1.0 +7.2.0 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4b3aa788b..45b36ecca 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -816,9 +816,7 @@ config ARCH_NR_GPIO source kernel/Kconfig.preempt -config HZ - int - default 100 +source kernel/Kconfig.hz config THUMB2_KERNEL bool "Compile the kernel in Thumb-2 mode (EXPERIMENTAL)" diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 529e88060..b1eedba96 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -53,6 +53,14 @@ endif comma = , +# +# The Scalar Replacement of Aggregates (SRA) optimization pass in GCC 4.9 and +# later may result in code being generated that handles signed short and signed +# char struct members incorrectly. So disable it. +# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65932) +# +KBUILD_CFLAGS += $(call cc-option,-fno-ipa-sra) + arch-$(CONFIG_ARCH_MSM_KRAIT) :=-D__LINUX_ARM_ARCH__=7 $(call cc-option,-mcpu=cortex-a15,-march=armv7-a) ifeq ($(CONFIG_AEABI),y) diff --git a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi index 695e45298..21487a85a 100644 --- a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi +++ b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi @@ -24,7 +24,7 @@ qcom,initial-pwrlevel = <1>; - qcom,idle-timeout = <8>; // + qcom,idle-timeout = <80>; //msec qcom,strtstp-sleepwake; qcom,clk-map = <0x0000006>; //KGSL_CLK_CORE | KGSL_CLK_IFACE diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index 1ec09b28d..7ff4bd3d8 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -47,6 +47,7 @@ CONFIG_LOCALVERSION="-idleKernel-hlte-" CONFIG_HAVE_KERNEL_LZMA=y CONFIG_HAVE_KERNEL_XZ=y CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y # CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set @@ -574,7 +575,11 @@ CONFIG_ARCH_NR_GPIO=0 # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y CONFIG_PREEMPT_COUNT=y -CONFIG_HZ=100 +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 # CONFIG_THUMB2_KERNEL is not set CONFIG_AEABI=y # CONFIG_OABI_COMPAT is not set @@ -602,13 +607,16 @@ CONFIG_KSM=y CONFIG_UKSM=y # CONFIG_KSM_LEGACY is not set CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 -# CONFIG_CLEANCACHE is not set +CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_ZSMALLOC_NEW=y CONFIG_PGTABLE_MAPPING=y CONFIG_ZSWAP=y # CONFIG_SWAP_ENABLE_READAHEAD is not set # CONFIG_ZSWAP_ENABLE_WRITEBACK is not set +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_ZCACHE=y CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY=y CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y CONFIG_FIX_INACTIVE_RATIO=y @@ -1244,6 +1252,8 @@ CONFIG_OF_BATTERYDATA=y CONFIG_OF_SUBCMDLINE_PARSE=y # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y +CONFIG_ZRAM=y +CONFIG_ZRAM_LZ4_COMPRESS=y # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 @@ -1718,6 +1728,7 @@ CONFIG_INPUT_WACOM=y # CONFIG_TOUCHSCREEN_FTS is not set # CONFIG_TOUCHSCREEN_FTS_S is not set # CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE is not set +CONFIG_STATE_NOTIFIER=y CONFIG_INPUT_MISC=y # CONFIG_INPUT_AD714X is not set # CONFIG_INPUT_BMA150 is not set @@ -3677,7 +3688,7 @@ CONFIG_SEC_FPGA=y # CONFIG_SENSORS_SSP_STM is not set CONFIG_SENSORS_SSP_STM_32F=y # CONFIG_SENSORS_SSP_STM_HESTIA is not set -CONFIG_SENSORS_SSP_STM_LEGACY=y +# CONFIG_SENSORS_SSP_STM_LEGACY is not set CONFIG_SENSORS_SYSFS=y CONFIG_SENSORS_SSP=y # CONFIG_SENSORS_SSP_AK8963C is not set @@ -4189,6 +4200,9 @@ CONFIG_CRYPTO_TWOFISH_COMMON=y CONFIG_CRYPTO_DEFLATE=y # CONFIG_CRYPTO_ZLIB is not set CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=y # # Random Number Generation @@ -4228,6 +4242,9 @@ CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y CONFIG_LZO_COMPRESS=y CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=y +CONFIG_LZ4_DECOMPRESS=y CONFIG_XZ_DEC=y CONFIG_XZ_DEC_X86=y CONFIG_XZ_DEC_POWERPC=y @@ -4239,6 +4256,7 @@ CONFIG_XZ_DEC_BCJ=y # CONFIG_XZ_DEC_TEST is not set CONFIG_DECOMPRESS_GZIP=y CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZ4=y CONFIG_GENERIC_ALLOCATOR=y CONFIG_REED_SOLOMON=y CONFIG_REED_SOLOMON_ENC8=y diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index a178f75da..38ecc608b 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -440,12 +440,17 @@ setup_return(struct pt_regs *regs, struct k_sigaction *ka, */ thumb = handler & 1; -#if __LINUX_ARM_ARCH__ >= 7 +#if __LINUX_ARM_ARCH__ >= 6 /* - * Clear the If-Then Thumb-2 execution state - * ARM spec requires this to be all 000s in ARM mode - * Snapdragon S4/Krait misbehaves on a Thumb=>ARM - * signal transition without this. + * Clear the If-Then Thumb-2 execution state. ARM spec + * requires this to be all 000s in ARM mode. Snapdragon + * S4/Krait misbehaves on a Thumb=>ARM signal transition + * without this. + * + * We must do this whenever we are running on a Thumb-2 + * capable CPU, which includes ARMv6T2. However, we elect + * to do this whenever we're on an ARMv6 or later CPU for + * simplicity. */ cpsr &= ~PSR_IT_MASK; #endif diff --git a/arch/arm/mach-msm/bam_dmux.c b/arch/arm/mach-msm/bam_dmux.c index 529e7d12a..6c5df1d38 100644 --- a/arch/arm/mach-msm/bam_dmux.c +++ b/arch/arm/mach-msm/bam_dmux.c @@ -1716,7 +1716,7 @@ static void ul_wakeup(void) } if (wait_for_dfab) { ret = wait_for_completion_timeout( - &dfab_unvote_completion, HZ); + &dfab_unvote_completion, msecs_to_jiffies(1000)); BUG_ON(ret == 0); } if (likely(do_vote_dfab)) diff --git a/arch/arm/mach-msm/clock-debug.c b/arch/arm/mach-msm/clock-debug.c index ec3bd0036..89d97b0d1 100644 --- a/arch/arm/mach-msm/clock-debug.c +++ b/arch/arm/mach-msm/clock-debug.c @@ -33,7 +33,7 @@ static LIST_HEAD(clk_list); static DEFINE_SPINLOCK(clk_list_lock); static struct dentry *debugfs_base; -static u32 debug_suspend = 1; +static u32 debug_suspend = 0; struct clk_table { struct list_head node; diff --git a/arch/arm/mach-msm/dma.c b/arch/arm/mach-msm/dma.c index afee25f67..adb1b596c 100644 --- a/arch/arm/mach-msm/dma.c +++ b/arch/arm/mach-msm/dma.c @@ -356,7 +356,7 @@ static void msm_dmov_enqueue_cmd_ext_work(struct work_struct *work) } if (!dmov_conf[adm].channel_active) { dmov_conf[adm].clk_ctl = CLK_TO_BE_DIS; - schedule_delayed_work(&dmov_conf[adm].work, (HZ/10)); + schedule_delayed_work(&dmov_conf[adm].work, msecs_to_jiffies(100)); } spin_unlock_irqrestore(&dmov_conf[adm].list_lock, flags); error: diff --git a/arch/arm/mach-msm/ipc_router.c b/arch/arm/mach-msm/ipc_router.c index fa9546984..a84b904a3 100644 --- a/arch/arm/mach-msm/ipc_router.c +++ b/arch/arm/mach-msm/ipc_router.c @@ -189,7 +189,7 @@ static LIST_HEAD(xprt_info_list); static DECLARE_RWSEM(xprt_info_list_lock_lha5); static DECLARE_COMPLETION(msm_ipc_local_router_up); -#define IPC_ROUTER_INIT_TIMEOUT (10 * HZ) +#define IPC_ROUTER_INIT_TIMEOUT 10000 static uint32_t next_port_id; static DEFINE_MUTEX(next_port_id_lock_lha1); @@ -3267,7 +3267,7 @@ void msm_ipc_router_xprt_notify(struct msm_ipc_router_xprt *xprt, if (!msm_ipc_router_workqueue) { ret = wait_for_completion_timeout(&msm_ipc_local_router_up, - IPC_ROUTER_INIT_TIMEOUT); + msecs_to_jiffies(IPC_ROUTER_INIT_TIMEOUT)); if (!ret || !msm_ipc_router_workqueue) { pr_err("%s: IPC Router not initialized\n", __func__); return; diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 796fba1de..c780f776e 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -235,7 +235,7 @@ static int __cpuinit release_from_pen(unsigned int cpu) */ gic_raise_softirq(cpumask_of(cpu), 1); - timeout = jiffies + (1 * HZ); + timeout = jiffies + msecs_to_jiffies(1000); while (time_before(jiffies, timeout)) { smp_rmb(); if (pen_release == -1) diff --git a/arch/arm/mach-msm/qdsp6v2/apr.c b/arch/arm/mach-msm/qdsp6v2/apr.c index d3119bccf..9de7c6cf0 100644 --- a/arch/arm/mach-msm/qdsp6v2/apr.c +++ b/arch/arm/mach-msm/qdsp6v2/apr.c @@ -220,11 +220,11 @@ int apr_wait_for_device_up(int dest_id) if (dest_id == APR_DEST_MODEM) rc = wait_event_interruptible_timeout(modem_wait, (apr_get_modem_state() == APR_SUBSYS_UP), - (1 * HZ)); + msecs_to_jiffies(1000)); else if (dest_id == APR_DEST_QDSP6) rc = wait_event_interruptible_timeout(dsp_wait, (apr_get_q6_state() == APR_SUBSYS_UP), - (1 * HZ)); + msecs_to_jiffies(1000)); else pr_err("%s: unknown dest_id %d\n", __func__, dest_id); /* returns left time */ diff --git a/arch/arm/mach-msm/qdsp6v2/apr_tal.c b/arch/arm/mach-msm/qdsp6v2/apr_tal.c index 4443d64e5..333ae8f6e 100644 --- a/arch/arm/mach-msm/qdsp6v2/apr_tal.c +++ b/arch/arm/mach-msm/qdsp6v2/apr_tal.c @@ -183,7 +183,7 @@ struct apr_svc_ch_dev *apr_tal_open(uint32_t svc, uint32_t dest, return NULL; } rc = wait_event_timeout(apr_svc_ch[dl][dest][svc].wait, - (apr_svc_ch[dl][dest][svc].smd_state == 1), 5 * HZ); + (apr_svc_ch[dl][dest][svc].smd_state == 1), msecs_to_jiffies(5000)); if (rc == 0) { pr_err("apr_tal:TIMEOUT for OPEN event\n"); mutex_unlock(&apr_svc_ch[dl][dest][svc].m_lock); diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c index 1c42020f1..9733612fc 100644 --- a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c +++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c @@ -31,7 +31,7 @@ #define USF_VERSION_ID 0x0171 /* Standard timeout in the asynchronous ops */ -#define USF_TIMEOUT_JIFFIES (1*HZ) /* 1 sec */ +#define USF_TIMEOUT_JIFFIES 1000 /* 1 sec */ /* Undefined USF device */ #define USF_UNDEF_DEV_ID 0xffff @@ -886,9 +886,9 @@ static int usf_set_us_detection(struct usf_type *usf, unsigned long arg) USF_ADSP_RESTART_STATE)); } else { if (detect_info.detect_timeout == USF_DEFAULT_TIMEOUT) - timeout = USF_TIMEOUT_JIFFIES; + timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES); else - timeout = detect_info.detect_timeout * HZ; + timeout = detect_info.detect_timeout * msecs_to_jiffies(1000); } rc = wait_event_interruptible_timeout(usf_xx->wait, (usf_xx->us_detect_type != @@ -1098,7 +1098,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg) else { prev_jiffies = jiffies; if (upd_tx_info.timeout == USF_DEFAULT_TIMEOUT) { - timeout = USF_TIMEOUT_JIFFIES; + timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES); rc = wait_event_timeout( usf_xx->wait, (usf_xx->prev_region != @@ -1107,7 +1107,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg) USF_WORK_STATE), timeout); } else { - timeout = upd_tx_info.timeout * HZ; + timeout = upd_tx_info.timeout * msecs_to_jiffies(1000); rc = wait_event_interruptible_timeout( usf_xx->wait, (usf_xx->prev_region != @@ -1190,7 +1190,7 @@ static int usf_set_rx_update(struct usf_xx_type *usf_xx, unsigned long arg) usf_xx->usc, &upd_rx_info.free_region) || (usf_xx->usf_state == USF_IDLE_STATE), - USF_TIMEOUT_JIFFIES); + msecs_to_jiffies(USF_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c index 1a1d58727..3c9f6c6b9 100644 --- a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c +++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c @@ -33,7 +33,7 @@ #define WRITEDONE_IDX_STATUS 0 /* Standard timeout in the asynchronous ops */ -#define Q6USM_TIMEOUT_JIFFIES (1*HZ) /* 1 sec */ +#define Q6USM_TIMEOUT_JIFFIES 1000 /* 1 sec */ static DEFINE_MUTEX(session_lock); @@ -103,7 +103,7 @@ static int q6usm_memory_map(uint32_t buf_add, int dir, uint32_t bufsz, rc = wait_event_timeout(this_mmap.cmd_wait, (atomic_read(&this_mmap.cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for memory_map\n", __func__); @@ -141,7 +141,7 @@ int q6usm_memory_unmap(uint32_t buf_add, int dir, uint32_t session, rc = wait_event_timeout(this_mmap.cmd_wait, (atomic_read(&this_mmap.cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for memory_unmap\n", __func__); @@ -795,7 +795,7 @@ int q6usm_open_read(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout, waited for OPEN_READ rc[%d]\n", @@ -905,7 +905,7 @@ int q6usm_enc_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg) } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout opcode[0x%x]\n", @@ -993,7 +993,7 @@ int q6usm_dec_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg) } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout opcode[0x%x]\n", @@ -1041,7 +1041,7 @@ int q6usm_open_write(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s:timeout. waited for OPEN_WRITR rc[%d]\n", @@ -1079,7 +1079,7 @@ int q6usm_run(struct us_client *usc, uint32_t flags, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for run success rc[%d]\n", @@ -1280,7 +1280,7 @@ int q6usm_cmd(struct us_client *usc, int cmd) goto fail_cmd; } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s:timeout. waited for response opcode[0x%x]\n", @@ -1320,11 +1320,11 @@ int q6usm_set_us_detection(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%ld\n", + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1366,11 +1366,11 @@ int q6usm_set_us_stream_param(int dir, struct us_client *usc, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_SET_PARAM: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + pr_err("%s: CMD_SET_PARAM: timeout=%ld\n", + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1412,11 +1412,11 @@ int q6usm_get_us_stream_param(int dir, struct us_client *usc, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_GET_PARAM: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + pr_err("%s: CMD_GET_PARAM: timeout=%ld\n", + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; diff --git a/arch/arm/mach-msm/smd_pkt.c b/arch/arm/mach-msm/smd_pkt.c index c6001a626..27cd2e50f 100644 --- a/arch/arm/mach-msm/smd_pkt.c +++ b/arch/arm/mach-msm/smd_pkt.c @@ -896,7 +896,7 @@ int smd_pkt_open(struct inode *inode, struct file *file) r = wait_event_interruptible_timeout( smd_pkt_devp->ch_opened_wait_queue, - smd_pkt_devp->is_open, (2 * HZ)); + smd_pkt_devp->is_open, msecs_to_jiffies(2000)); if (r == 0) { r = -ETIMEDOUT; /* close the ch to sync smd's state with smd_pkt */ diff --git a/arch/arm/mach-msm/smd_qmi.c b/arch/arm/mach-msm/smd_qmi.c index ca10f00d0..925fb7d4b 100644 --- a/arch/arm/mach-msm/smd_qmi.c +++ b/arch/arm/mach-msm/smd_qmi.c @@ -509,7 +509,7 @@ static void qmi_notify(void *priv, unsigned event) int sz; sz = smd_cur_packet_size(ctxt->ch); if ((sz > 0) && (sz <= smd_read_avail(ctxt->ch))) { - wake_lock_timeout(&ctxt->wake_lock, HZ / 2); + wake_lock_timeout(&ctxt->wake_lock, msecs_to_jiffies(500)); queue_work(qmi_wq, &ctxt->read_work); } break; diff --git a/arch/arm/mach-msm/smd_tty.c b/arch/arm/mach-msm/smd_tty.c index 55b000340..e7945633e 100644 --- a/arch/arm/mach-msm/smd_tty.c +++ b/arch/arm/mach-msm/smd_tty.c @@ -484,7 +484,7 @@ static int smd_tty_port_activate(struct tty_port *tport, } res = wait_event_interruptible_timeout(info->ch_opened_wait_queue, - info->is_open, (2 * HZ)); + info->is_open, msecs_to_jiffies(2000)); if (res == 0) res = -ETIMEDOUT; if (res < 0) { diff --git a/arch/arm/mach-msm/smem_log.c b/arch/arm/mach-msm/smem_log.c index 35a6e1599..93cf370f8 100644 --- a/arch/arm/mach-msm/smem_log.c +++ b/arch/arm/mach-msm/smem_log.c @@ -1740,7 +1740,7 @@ static int debug_dump(char *buf, int max, uint32_t cont) r = wait_event_interruptible_timeout(inst[GEN].read_wait, inst[GEN].last_read_avail, smem_log_timeout_ms * - HZ / 1000); + msecs_to_jiffies(1)); DBG("%s: read available %d\n", __func__, inst[GEN].last_read_avail); if (r < 0) @@ -1764,7 +1764,7 @@ static int debug_dump_sym(char *buf, int max, uint32_t cont) r = wait_event_interruptible_timeout(inst[GEN].read_wait, inst[GEN].last_read_avail, smem_log_timeout_ms * - HZ / 1000); + msecs_to_jiffies(1)); DBG("%s: readavailable %d\n", __func__, inst[GEN].last_read_avail); if (r < 0) diff --git a/arch/arm/mach-msm/smp2p_spinlock_test.c b/arch/arm/mach-msm/smp2p_spinlock_test.c index c14bac0a7..5ecf6be79 100644 --- a/arch/arm/mach-msm/smp2p_spinlock_test.c +++ b/arch/arm/mach-msm/smp2p_spinlock_test.c @@ -88,7 +88,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_out.cb_completion, HZ * 2), + &cb_out.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_out.cb_count, ==, 1); UT_ASSERT_INT(cb_out.event_open, ==, 1); @@ -99,7 +99,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_open, ==, 1); @@ -116,7 +116,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -225,7 +225,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, do { UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); INIT_COMPLETION(cb_in.cb_completion); ret = msm_smp2p_in_read(remote_pid, diff --git a/arch/arm/mach-msm/smp2p_test.c b/arch/arm/mach-msm/smp2p_test.c index 2f86df4b7..c16e79371 100644 --- a/arch/arm/mach-msm/smp2p_test.c +++ b/arch/arm/mach-msm/smp2p_test.c @@ -85,7 +85,7 @@ static void smp2p_ut_local_basic(struct seq_file *s) /* verify port was opened */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), >, 0); + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2); @@ -173,7 +173,7 @@ static void smp2p_ut_local_late_open(struct seq_file *s) /* verify port was opened */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); @@ -265,7 +265,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 8), + &cb_data.cb_completion, msecs_to_jiffies(125)), ==, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 0); UT_ASSERT_INT(cb_data.event_open, ==, 0); @@ -294,7 +294,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); @@ -384,7 +384,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) local = msm_smp2p_init_rmt_lpb_proc(SMP2P_REMOTE_MOCK_PROC); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2); @@ -398,7 +398,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify Echo Response */ @@ -421,7 +421,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify PINGPONG Response */ @@ -443,7 +443,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify CLEARALL response */ @@ -494,7 +494,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_out.cb_completion, HZ / 2), + &cb_out.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_out.cb_count, ==, 1); UT_ASSERT_INT(cb_out.event_open, ==, 1); @@ -505,7 +505,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_open, ==, 1); @@ -523,7 +523,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -549,7 +549,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -573,7 +573,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -598,7 +598,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), ==, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 0); UT_ASSERT_INT(cb_in.event_entry_update, ==, 0); @@ -814,7 +814,7 @@ static void smp2p_ut_local_in_max_entries(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in[j].cb_completion), HZ / 2), + &(cb_in[j].cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in[j].cb_count, ==, 1); UT_ASSERT_INT(cb_in[j].event_entry_update, ==, 0); @@ -909,7 +909,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in_1.cb_completion), HZ / 2), + &(cb_in_1.cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in_1.cb_count, ==, 1); UT_ASSERT_INT(cb_in_1.event_entry_update, ==, 0); @@ -920,7 +920,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in_2.cb_completion), HZ / 2), + &(cb_in_2.cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in_2.cb_count, ==, 1); UT_ASSERT_INT(cb_in_2.event_entry_update, ==, 0); diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c index 1ea0f2dc6..2a6e0bda5 100644 --- a/arch/arm/mach-msm/timer.c +++ b/arch/arm/mach-msm/timer.c @@ -722,7 +722,7 @@ static void msm_timer_sync_to_gpt(struct msm_clock *clock, int exit_sleep) &__get_cpu_var(msm_clocks_percpu)[clock->index]; struct msm_timer_sync_data_t data; uint32_t gpt_clk_val; - u64 gpt_period = (1ULL << 32) * HZ; + u64 gpt_period = (1ULL << 32) * msecs_to_jiffies(1000); u64 now = get_jiffies_64(); do_div(gpt_period, gpt_hz); diff --git a/block/Makefile b/block/Makefile index 25f094699..9085458d0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_IOSCHED_SIO) += sio-iosched.o obj-$(CONFIG_IOSCHED_FIOPS) += fiops-iosched.o obj-$(CONFIG_IOSCHED_TEST) += test-iosched.o +obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000..637c90665 --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,4207 @@ +/* + * Budget Fair Queueing (BFQ) disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness + * with the BFQ Disk I/O Scheduler'', + * Proceedings of the 5th Annual International Systems and Storage + * Conference (SYSTOR '12), June 2012. + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfq.h" +#include "blk.h" + +/* Expiration time of sync (0) and async (1) requests, in jiffies. */ +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; + +/* Maximum backwards seek, in KiB. */ +static const int bfq_back_max = 16 * 1024; + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in jiffies. */ +static int bfq_slice_idle = HZ / 125; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = 16 * 1024; +static const int bfq_max_budget_async_rq = 4; + +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number + * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout_sync = HZ / 8; +static int bfq_timeout_async = HZ / 25; + +struct kmem_cache *bfq_pool; + +/* Below this threshold (in ms), we consider thinktime immediate. */ +#define BFQ_MIN_TT 2 + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) + +/* Min samples used for peak rate estimation (for autotuning). */ +#define BFQ_PEAK_RATE_SAMPLES 32 + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] + * are the reference values for a slow/fast rotational device, whereas + * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for + * a slow/fast non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1536, 10752}; +static int R_fast[2] = {17415, 34791}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup.c" + +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * The following macro groups conditions that need to be evaluated when + * checking if existing queues and groups form a symmetric scenario + * and therefore idling can be reduced or disabled for some of the + * queues. See the comment to the function bfq_bfqq_must_not_expire() + * for further details. + */ +#ifdef CONFIG_CGROUP_BFQIO +#define symmetric_scenario (!bfqd->active_numerous_groups && \ + !bfq_differentiated_weights(bfqd)) +#else +#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) +#endif + +/* + * We regard a request as SYNC, if either it's a read or has the SYNC bit + * set (in which case it could also be a direct WRITE). + */ +static inline int bfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ + + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_CGROUP_BFQIO + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. + * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Do not insert if the entity is already associated with a + * counter, which happens if: + * 1) the entity is associated with a queue, + * 2) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 3) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (entity->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + entity->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + entity->weight_counter->weight = entity->weight; + rb_link_node(&entity->weight_counter->weights_node, parent, new); + rb_insert_color(&entity->weight_counter->weights_node, root); + +inc_counter: + entity->weight_counter->num_active++; +} + +/* + * Decrement the weight counter associated with the entity, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + if (!entity->weight_counter) + return; + + BUG_ON(RB_EMPTY_ROOT(root)); + BUG_ON(entity->weight_counter->weight != entity->weight); + + BUG_ON(!entity->weight_counter->num_active); + entity->weight_counter->num_active--; + if (entity->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase(&entity->weight_counter->weights_node, root); + kfree(entity->weight_counter); + +reset_entity_pointer: + entity->weight_counter = NULL; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev != NULL) + prev = rb_entry_rq(rbprev); + + if (rbnext != NULL) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * + bfq_async_charge_factor)); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_activate_bfqq(bfqd, bfqq); + } +} + +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? bfqq->bic->cooperations : 0; +} + +static inline void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + /* Assuming that the flag in_large_burst is already correctly set */ + if (bic->wr_time_left && bfqq->bfqd->low_latency && + !bfq_bfqq_in_large_burst(bfqq) && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.ioprio_changed = 1; + } + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ + bic->wr_time_left = 0; +} + +/* Must be called with the queue_lock held. */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *pos, *n; + + hlist_for_each_entry_safe(item, pos, n, + &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *p, *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, n, &bfqd->burst_list, + burst_list_node) + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_mark_bfqq_in_large_burst(bfqq); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, p, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* burst not yet large: add bfqq to the burst list */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues happen to become active shortly after each other, then, + * to help the processes associated to these queues get their job done as + * soon as possible, it is usually better to not grant either weight-raising + * or device idling to these queues. In this comment we describe, firstly, + * the reasons why this fact holds, and, secondly, the next function, which + * implements the main steps needed to properly mark these queues so that + * they can then be treated in a different way. + * + * As for the terminology, we say that a queue becomes active, i.e., + * switches from idle to backlogged, either when it is created (as a + * consequence of the arrival of an I/O request), or, if already existing, + * when a new request for the queue arrives while the queue is idle. + * Bursts of activations, i.e., activations of different queues occurring + * shortly after each other, are typically caused by services or applications + * that spawn or reactivate many parallel threads/processes. Examples are + * systemd during boot or git grep. + * + * These services or applications benefit mostly from a high throughput: + * the quicker the requests of the activated queues are cumulatively served, + * the sooner the target job of these queues gets completed. As a consequence, + * weight-raising any of these queues, which also implies idling the device + * for it, is almost always counterproductive: in most cases it just lowers + * throughput. + * + * On the other hand, a burst of activations may be also caused by the start + * of an application that does not consist in a lot of parallel I/O-bound + * threads. In fact, with a complex application, the burst may be just a + * consequence of the fact that several processes need to be executed to + * start-up the application. To start an application as quickly as possible, + * the best thing to do is to privilege the I/O related to the application + * with respect to all other I/O. Therefore, the best strategy to start as + * quickly as possible an application that causes a burst of activations is + * to weight-raise all the queues activated during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the two + * types of bursts need to be distinguished. Fortunately, this seems + * relatively easy to do, by looking at the sizes of the bursts. In + * particular, we found a threshold such that bursts with a larger size + * than that threshold are apparently caused only by services or commands + * such as systemd or git grep. For brevity, hereafter we call just 'large' + * these bursts. BFQ *does not* weight-raise queues whose activations occur + * in a large burst. In addition, for each of these queues BFQ performs or + * does not perform idling depending on which choice boosts the throughput + * most. The exact choice depends on the device and request pattern at + * hand. + * + * Turning back to the next function, it implements all the steps needed + * to detect the occurrence of a large burst and to properly mark all the + * queues belonging to it (so that they can then be treated in a different + * way). This goal is achieved by maintaining a special "burst list" that + * holds, temporarily, the queues that belong to the burst in progress. The + * list is then used to mark these queues as belonging to a large burst if + * the burst does become large. The main steps are the following. + * + * . when the very first queue is activated, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is activated while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . if a queue Q that does not belong to the burst is activated a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool idle_for_long_time) +{ + /* + * If bfqq happened to be activated in a burst, but has been idle + * for at least as long as an interactive queue, then we assume + * that, in the overall I/O initiated in the burst, the I/O + * associated to bfqq is finished. So bfqq does not need to be + * treated as a queue belonging to a burst anymore. Accordingly, + * we reset bfqq's in_large_burst flag if set, and remove bfqq + * from the burst list if it's there. We do not decrement instead + * burst_size, because the fact that bfqq does not need to belong + * to the burst list any more does not invalidate the fact that + * bfqq may have been activated during the current burst. + */ + if (idle_for_long_time) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + /* + * If bfqq is already in the burst list or is part of a large + * burst, then there is nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq)) + return; + + /* + * If bfqq's activation happens late enough, then the current + * burst is finished, and related data structures must be reset. + * + * In this respect, consider the special case where bfqq is the very + * first queue being activated. In this case, last_ins_in_burst is + * not yet significant when we get here. But it is easy to verify + * that, whether or not the following condition is true, bfqq will + * end up being inserted into the burst list. In particular the + * list will happen to contain only bfqq. And this is exactly what + * has to happen, as bfqq may be the first queue in a possible + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval)) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + return; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_mark_bfqq_in_large_burst(bfqq); + return; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); + + if (bfq_bfqq_sync(bfqq)) { + bool already_in_burst = + !hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq); + bfq_handle_burst(bfqd, bfqq, idle_for_long_time); + /* + * If bfqq was not already in the current burst, + * then, at this point, bfqq either has been + * added to the current burst or has caused the + * current burst to terminate. In particular, in + * the second case, bfqq has become the first + * queue in a possible new burst. + * In both cases last_ins_in_burst needs to be + * moved forward. + */ + if (!already_in_burst) + bfqd->last_ins_in_burst = jiffies; + } + + coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (!bfqd->low_latency) + goto add_bfqq_busy; + + if (bfq_bfqq_just_split(bfqq)) + goto set_ioprio_changed; + + /* + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and its + * requests have not been redirected to a shared queue) + * start a weight-raising period. + */ + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. + * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + } +set_ioprio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return NULL; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + if (bfqq != NULL) { + sector_t sector = bio->bi_sector + bio_sectors(bio); + + return elv_rb_find(&bfqq->sort_list, sector); + } + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); + } + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. + */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(rq_fifo_time(next), rq_fifo_time(rq))) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq_set_fifo_time(rq, rq_fifo_time(next)); + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +/* Must be called with bfqq != NULL */ +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(bfqq == NULL); + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + /* Trigger a weight change on the next activation of the queue */ + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j] != NULL) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq != NULL) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_sector; +} + +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) +{ + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; +} + +static inline int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + return new_bfqq; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || + !bfqd->in_service_bic || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ + } + + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic == NULL, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (bfqq->bic == NULL) + return; + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. + */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; +} + +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + bfq_put_queue(bfqq); +} + +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. + */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (bic == NULL) + return 0; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq != NULL) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq != NULL) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + unsigned long sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue either + * has been seeky for long enough or has already proved to be + * constantly seeky. + */ + if (bfq_sample_valid(bfqq->seek_samples) && + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > + bfq_max_budget(bfqq->bfqd) / 8) || + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + symmetric_scenario) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->wr_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); +} + +/* + * Move request from internal lists to the request queue dispatch list. + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (time_before(jiffies, rq_fifo_time(rq))) + return NULL; + + return rq; +} + +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->in_service_queue); + + __bfq_bfqd_reset_in_service(bfqd); + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * Overloading budget_timeout field to store the time + * at which the queue remains with no backlog; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); + } else { + bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_rq_pos_tree_add(bfqd, bfqq); + } +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. + */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. + */ + next_rq = bfqq->next_rq; + if (next_rq != NULL) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + else + bfqq->entity.budget = bfqq->max_budget; + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return 0; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return 0; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. + */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + if (bw == 0) + return 0; + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update) { + int dev_type = blk_queue_nonrot(bfqd->queue); + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, + timeout); + bfq_log(bfqd, "new max_budget=%lu", + bfqd->bfq_max_budget); + } + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + } + } + + /* + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. + */ + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return 0; + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. + * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy application may happen to behave in an + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). + * + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: + * 1) HZ is so low that the duration of a jiffy is comparable to or higher + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with + * HZ=100. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, we do not use as a reference time interval just + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. + */ +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return max(bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + bfqq->bfqd->bfq_slice_idle + 4); +} + +/* + * Return the largest-possible time instant such that, for as long as possible, + * the current time will be lower than this time instant according to the macro + * time_is_before_jiffies(). + */ +static inline unsigned long bfq_infinity_from_now(unsigned long now) +{ + return now + ULONG_MAX / 2; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int compensate, + enum bfqq_expiration reason) +{ + int slow; + BUG_ON(bfqq != bfqd->in_service_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * + * Processes doing I/O in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. + */ + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + + bfqq->service_from_backlogged += bfqq->entity.service; + + if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && + !bfq_bfqq_constantly_seeky(bfqq)) { + bfq_mark_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) + bfqd->const_seeky_busy_in_flight_queues++; + } + + if (reason == BFQ_BFQQ_TOO_IDLE && + bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding requests, + * then the request pattern is isochronous (see the comments + * to the function bfq_bfqq_softrt_next_start()). Hence we + * can compute soft_rt_next_start. If, instead, the queue + * still has outstanding requests, then we have to wait + * for the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ + if (bfqq->dispatched == 0) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + else { + /* + * The application is still waiting for the + * completion of one or more requests: + * prevent it from possibly being incorrectly + * deemed as soft real-time by setting its + * soft_rt_next_start to infinity. In fact, + * without this assignment, the application + * would be incorrectly deemed as soft + * real-time if: + * 1) it issued a new request before the + * completion of all its in-flight + * requests, and + * 2) at that time, its soft_rt_next_start + * happened to be in the past. + */ + bfqq->soft_rt_next_start = + bfq_infinity_from_now(jiffies); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq) || + time_before(jiffies, bfqq->budget_timeout)) + return 0; + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. + * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. + * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). + * + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, hence no state needs + * to be maintained in this case. + * + * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. + * + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). + * + * To sum up, when all the required symmetry and throughput-boosting + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. + */ +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) + +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) + +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) + + return bfq_bfqq_sync(bfqq) && + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); +} + +/* + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the disk must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * returns true. + */ +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_must_not_expire(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !timer_pending(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. + */ + if (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); + } + } + /* Update weight both if it must be raised and if it must be lowered */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. */ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->in_service_bic == NULL) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + bfqq = bfq_select_queue(bfqd); + if (bfqq == NULL) + return 0; + + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. + */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->in_service_queue == bfqq); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +static inline void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + bic->wr_time_left = 1; +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic->bfqq[BLK_RW_ASYNC]) { + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); + bic->bfqq[BLK_RW_ASYNC] = NULL; + } + + if (bic->bfqq[BLK_RW_SYNC]) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + struct io_context *ioc = bic->icq.ioc; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info.dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { + printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->entity.new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_check_ioprio_change(struct io_context *ioc, + struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + if (unlikely(bfqd == NULL)) + return; + + bic->ioprio = ioprio; + + bfqq = bic->bfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc, + GFP_ATOMIC); + if (new_bfqq != NULL) { + bic->bfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = bic->bfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_set_next_ioprio_data(bfqq, bic); + + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = 0; + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. + */ + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct io_context *ioc, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct bfq_io_cq *bic; + +retry: + bic = bfq_bic_lookup(bfqd, ioc); + /* bic always exists here */ + bfqq = bic_to_bfqq(bic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. + */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) +{ + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + unsigned long elapsed = jiffies - bic->ttime.last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / + bic->ttime.ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. + */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (bfq_bfqq_just_split(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { + bfq_clear_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the disk is being idled to wait for + * a new request from the in-service queue, we avoid + * unplugging the device and committing the disk to serve + * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this one + * quickly, then the device will be unplugged and + * larger requests will be dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq != NULL) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + atomic_inc(&new_bfqq->ref); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + bfq_add_request(rq); + + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic != NULL) + bfqq->bic->wr_time_left = 0; + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + bool sync = bfq_bfqq_sync(bfqq); + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + + if (sync) { + bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; + } + + /* + * If we are waiting to discover whether the request pattern of the + * task associated with the queue is actually isochronous, and + * both requisites for this condition to hold are satisfied, then + * compute soft_rt_next_start (see the comments to the function + * bfq_bfqq_softrt_next_start()). + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + if (bfqq != NULL) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. + */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + bool split = false; + + /* handle changed prio notifications; cgroup change is handled separately */ + if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED)) + bfq_check_ioprio_change(bic->icq.ioc, bic); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + spin_lock_irqsave(q->queue_lock, flags); + + if (bic == NULL) + goto queue_fail; + + bfqg = bfq_bic_update_cgroup(bic); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask); + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + } + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bic); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. + */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static void *bfq_init_queue(struct request_queue *q) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (bfqd == NULL) + return NULL; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.ioprio_changed = 1; + + bfqd->queue = q; + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + return NULL; + } + + bfqd->root_group = bfqg; + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +#ifdef CONFIG_CGROUP_BFQIO + bfqd->active_numerous_groups = 0; +#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 11; + bfqd->bfq_burst_interval = msecs_to_jiffies(500); + + bfqd->low_latency = true; + + bfqd->bfq_wr_coeff = 20; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + bfqd->busy_in_flight_queues = 0; + bfqd->const_seeky_busy_in_flight_queues = 0; + + /* + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->device_speed = BFQ_BFQD_FAST; + + return bfqd; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? + jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1], + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + if (bfq_slice_idle == 0) + bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + /* + * Times to load large popular applications for the typical systems + * installed on the reference devices (see the comments before the + * definitions of the two arrays). + */ + T_slow[0] = msecs_to_jiffies(2600); + T_slow[1] = msecs_to_jiffies(1000); + T_fast[0] = msecs_to_jiffies(5500); + T_fast[1] = msecs_to_jiffies(2000); + + /* + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). + */ + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); + + return 0; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 index 000000000..d0890c6d4 --- /dev/null +++ b/block/bfq-sched.c @@ -0,0 +1,1180 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_in_service == NULL); + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_in_service->budget; +} + +static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_in_service; + + if (sd->in_service_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... + */ + next_in_service = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_in_service = next_in_service; + + if (next_in_service != NULL) + bfq_update_budget(next_in_service); + + return 1; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_in_service != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; + if (bfqg->active_entities == 2) + bfqd->active_numerous_groups++; + } +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static inline unsigned short bfq_ioprio_to_weight(int ioprio) +{ + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static inline unsigned short bfq_weight_to_ioprio(int weight) +{ + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; + if (bfqg->active_entities == 1) { + BUG_ON(!bfqd->active_numerous_groups); + bfqd->active_numerous_groups--; + } + } +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd; + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned short prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq != NULL) + bfqd = bfqq->bfqd; +#ifdef CONFIG_CGROUP_BFQIO + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; + BUG_ON(!bfqd); + } +#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "update_weight_prio: " + "new_weight %d\n", + entity->new_weight); + BUG(); + } + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq != NULL ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, remove the entity + * from its old weight counter (if there is a counter + * associated with the entity), and add it to the counter + * associated with its new weight. + */ + if (prev_weight != new_weight) { + root = bfqq ? &bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); + } + entity->weight = new_weight; + /* + * Add the entity to its weights tree only if it is + * not associated with a weight-raised queue. + */ + if (prev_weight != new_weight && + (bfqq ? bfqq->wr_coeff == 1 : 1)) + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, entity, root); + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. + * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. + */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_in_service && entity->tree != NULL); + + if (was_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->in_service_entity == entity); + BUG_ON(sd->next_in_service == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. + */ + break; + + if (sd->next_in_service != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. + */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i = 0; + + BUG_ON(sd->in_service_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->in_service_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + if (bfqd->in_service_bic != NULL) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfqd->in_service_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->in_service_queue) + __bfq_bfqd_reset_in_service(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + if (!bfqq->dispatched) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; + + if (!bfqq->dispatched) { + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + bfqd->busy_in_flight_queues++; + if (bfq_bfqq_constantly_seeky(bfqq)) + bfqd->const_seeky_busy_in_flight_queues++; + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 index 000000000..e6f1a88f4 --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,806 @@ +/* + * BFQ-v7r8 for 3.4.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + struct rb_root active; + struct rb_root idle; + + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; + + u64 vtime; + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * @in_service_entity: entity in service. + * @next_in_service: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_in_service points to the active entity of the sched_data + * service trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *in_service_entity; + struct bfq_entity *next_in_service; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. + * @weight: weight of the entities that this counter refers to. + * @num_active: number of active entities with this weight. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree + * and @group_weights_tree). + */ +struct bfq_weight_counter { + short int weight; + unsigned int num_active; + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. + * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; + struct bfq_weight_counter *weight_counter; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; + + struct bfq_entity *parent; + + struct bfq_sched_data *my_sched_data; + struct bfq_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @burst_list_node: node for the device's burst list. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @requests_within_timer: number of consecutive pairs of request completion + * and arrival, such that the queue becomes idle + * after the completion, but the next request arrives + * within an idle time slice; used only if the queue's + * IO_bound has been cleared. + * @pid: pid of the process owning the queue, used for logging purposes. + * @last_wr_start_finish: start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period + * @wr_cur_max_time: current max raising time for this queue + * @soft_rt_next_start: minimum time instant such that, only if a new + * request is enqueued after this time instant in an + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function + * bfq_bfqq_softrt_next_start()) + * @last_idle_bklogged: time of the last transition of the @bfq_queue from + * idle to backlogged + * @service_from_backlogged: cumulative service received from the @bfq_queue + * since the last transition from idle to + * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned int flags; + + struct list_head bfqq_list; + + struct hlist_node burst_list_node; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + unsigned int requests_within_timer; + + pid_t pid; + struct bfq_io_cq *bic; + + /* weight-raising fields */ + unsigned long wr_cur_max_time; + unsigned long soft_rt_next_start; + unsigned long last_wr_start_finish; + unsigned int wr_coeff; + unsigned long last_idle_bklogged; + unsigned long service_from_backlogged; +}; + +/** + * struct bfq_ttime - per process thinktime stats. + * @ttime_total: total process thinktime + * @ttime_samples: number of thinktime samples + * @ttime_mean: average process thinktime + */ +struct bfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + * @icq: associated io_cq structure + * @bfqq: array of two process queues, the sync and the async + * @ttime: associated @bfq_ttime struct + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @saved_in_large_burst: same purpose as the previous fields for the + * value of the field keeping the queue's belonging + * to a large burst + * @was_in_burst_list: true if the queue belonged to a burst list + * before its merge with another cooperating queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues + */ +struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ + struct bfq_queue *bfqq[2]; + struct bfq_ttime ttime; + int ioprio; + + unsigned int wr_time_left; + bool saved_idle_window; + bool saved_IO_bound; + + bool saved_in_large_burst; + bool was_in_burst_list; + + unsigned int cooperations; + unsigned int failed_cooperations; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). + * @active_numerous_groups: number of bfq_groups containing more than one + * active @bfq_entity. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to + * the functions bfq_weights_tree_[add|remove] for + * further details). + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted + * by weight. Used to keep track of whether all + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). + * @busy_queues: number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + * @busy_in_flight_queues: number of @bfq_queues containing pending or + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues + * (that is, seeky queues that expired + * for budget timeout at least once) + * containing pending or in-flight + * requests, including the in-service + * @bfq_queue if constantly seeky. This + * field is updated only if the device + * is rotational, but used only if the + * device is also NCQ-capable (see the + * comments to @busy_in_flight_queues). + * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @in_service_queue: bfq_queue in service. + * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. + * @bfq_user_max_budget: user-configured max budget value + * (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to + * to prevent seeky queues to impose long latencies to well + * behaved ones (this also implies that seeky queues cannot + * receive guarantees in the service domain; after a timeout + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is + * no more granted any weight-raising. + * @bfq_failed_cooperations: number of consecutive failed cooperation + * chances after which weight-raising is restored + * to a queue subject to more than bfq_coop_thresh + * queue merges. + * @bfq_requests_within_timer: number of consecutive requests that must be + * issued within the idle time slice to set + * again idling to a queue which was marked as + * non-I/O-bound (see the definition of the + * IO_bound flag for further details). + * @last_ins_in_burst: last time at which a queue entered the current + * burst of queues being activated shortly after + * each other; for more details about this and the + * following parameters related to a burst of + * activations, see the comments to the function + * @bfq_handle_burst. + * @bfq_burst_interval: reference time interval used to decide whether a + * queue has been activated shortly after + * @last_ins_in_burst. + * @burst_size: number of queues in the current burst of queue activations. + * @bfq_large_burst_thresh: maximum burst size above which the current + * queue-activation burst is deemed as 'large'. + * @large_burst: true if a large queue-activation burst is in progress. + * @burst_list: head of the burst list (as for the above fields, more details + * in the comments to the function bfq_handle_burst). + * @low_latency: if set to true, low-latency heuristics are enabled. + * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised + * queue is multiplied. + * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). + * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. + * @bfq_wr_min_idle_time: minimum idle period after which weight-raising + * may be reactivated for a queue (in jiffies). + * @bfq_wr_min_inter_arr_async: minimum period between request arrivals + * after which weight-raising may be + * reactivated for an already busy queue + * (in jiffies). + * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, + * sectors per seconds. + * @RT_prod: cached value of the product R*T used for computing the maximum + * duration of the weight raising automatically. + * @device_speed: device-speed class for the low-latency heuristic. + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * + * All the fields are protected by the @queue lock. + */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + struct rb_root rq_pos_tree; + +#ifdef CONFIG_CGROUP_BFQIO + int active_numerous_groups; +#endif + + struct rb_root queue_weights_tree; + struct rb_root group_weights_tree; + + int busy_queues; + int busy_in_flight_queues; + int const_seeky_busy_in_flight_queues; + int wr_busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *in_service_queue; + struct bfq_io_cq *in_service_bic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; + unsigned int bfq_requests_within_timer; + + unsigned long last_ins_in_burst; + unsigned long bfq_burst_interval; + int burst_size; + unsigned long bfq_large_burst_thresh; + bool large_burst; + struct hlist_head burst_list; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_wr_coeff; + unsigned int bfq_wr_max_time; + unsigned int bfq_wr_rt_max_time; + unsigned int bfq_wr_min_idle_time; + unsigned long bfq_wr_min_inter_arr_async; + unsigned int bfq_wr_max_softrt_rate; + u64 RT_prod; + enum bfq_device_speed device_speed; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. + */ + BFQ_BFQQ_FLAG_constantly_seeky, /* + * bfqq has proved to be slow and + * seeky until budget timeout + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. + */ +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + +static void bfq_check_ioprio_change(struct io_context *ioc, + struct bfq_io_cq *bic); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +#endif /* _BFQ_H */ diff --git a/block/blk-exec.c b/block/blk-exec.c index 1b5cb66ef..55dd9861d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "blk.h" diff --git a/build.sh b/build.sh index d6c7de091..0a6060e70 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ RDIR=$(pwd) # japanese variants: # dcm = N900D / SC-01F (NTT Docomo) # kdi = N900J / SCL22 (au by KDDI) -VARIANT=can +VARIANT=eur [ -z $VER ] && \ # version number @@ -61,11 +61,13 @@ MAKE_ZIP=1 MAKE_TAR=1 # directory containing cross-compile arm-cortex_a15 toolchain -TOOLCHAIN=/home/jc/build/toolchain/arm-cortex_a15-linux-gnueabihf-linaro_4.9.4-2015.06 +TOOLCHAIN=/home/vagrant/g2/DORIMANX_LG_STOCK_LP_KERNEL/android-toolchain # amount of cpu threads to use in kernel make process THREADS=5 +G_SESNSOR_HUB=1 + ############## SCARY NO-TOUCHY STUFF ############### export ARCH=arm @@ -82,6 +84,10 @@ if ! [ -d $RDIR"/ik.ramdisk/variant/$VARIANT/" ] ; then exit -1 fi +if [ $G_SESNSOR_HUB -eq 1 ] ; then + sed -i "s/CONFIG_SENSORS_SSP_STM_LEGACY=y/# CONFIG_SENSORS_SSP_STM_LEGACY is not set/" arch/arm/configs/ik_defconfig +fi + [ $PERMISSIVE -eq 1 ] && SELINUX="never_enforce" || SELINUX="always_enforce" KDIR=$RDIR/build/arch/arm/boot diff --git a/crypto/Kconfig b/crypto/Kconfig index 566ebe0d8..9a6bd9712 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1035,6 +1035,22 @@ config CRYPTO_LZO help This is the LZO algorithm. +config CRYPTO_LZ4 + tristate "LZ4 compression algorithm" + select CRYPTO_ALGAPI + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 algorithm. + +config CRYPTO_LZ4HC + tristate "LZ4HC compression algorithm" + select CRYPTO_ALGAPI + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 high compression mode algorithm. + comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index cba53f235..df6e37262 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_LZ4) += lz4.o +obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c index 313a05b8f..7da6f6a21 100644 --- a/crypto/ablkcipher.c +++ b/crypto/ablkcipher.c @@ -715,7 +715,7 @@ struct crypto_ablkcipher *crypto_alloc_ablkcipher(const char *alg_name, err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/crypto/ahash.c b/crypto/ahash.c index d4c8aae5b..2a79f7ff1 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -487,7 +487,8 @@ static int ahash_prepare_alg(struct ahash_alg *alg) struct crypto_alg *base = &alg->halg.base; if (alg->halg.digestsize > PAGE_SIZE / 8 || - alg->halg.statesize > PAGE_SIZE / 8) + alg->halg.statesize > PAGE_SIZE / 8 || + alg->halg.statesize == 0) return -EINVAL; base->cra_type = &crypto_ahash_type; diff --git a/crypto/algapi.c b/crypto/algapi.c index c18f8b104..8df2104b5 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -386,7 +386,7 @@ static void crypto_wait_for_test(struct crypto_larval *larval) crypto_alg_tested(larval->alg.cra_driver_name, 0); } - err = wait_for_completion_interruptible(&larval->completion); + err = wait_for_completion_killable(&larval->completion); WARN_ON(err); out: diff --git a/crypto/api.c b/crypto/api.c index 5c13704f4..29fe0757f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -181,7 +181,7 @@ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) struct crypto_larval *larval = (void *)alg; long timeout; - timeout = wait_for_completion_interruptible_timeout( + timeout = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); alg = larval->adult; @@ -460,7 +460,7 @@ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } @@ -589,7 +589,7 @@ void *crypto_alloc_tfm(const char *alg_name, err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index 910497bd7..0c19d0357 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -350,7 +350,7 @@ static struct crypto_alg *crypto_user_aead_alg(const char *name, u32 type, err = PTR_ERR(alg); if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/crypto/lz4.c b/crypto/lz4.c new file mode 100644 index 000000000..34d072b72 --- /dev/null +++ b/crypto/lz4.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include + +struct lz4_ctx { + void *lz4_comp_mem; +}; + +static int lz4_init(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS); + if (!ctx->lz4_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4_exit(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + vfree(ctx->lz4_comp_mem); +} + +static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4 = { + .cra_name = "lz4", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), + .cra_init = lz4_init, + .cra_exit = lz4_exit, + .cra_u = { .compress = { + .coa_compress = lz4_compress_crypto, + .coa_decompress = lz4_decompress_crypto } } +}; + +static int __init lz4_mod_init(void) +{ + return crypto_register_alg(&alg_lz4); +} + +static void __exit lz4_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4); +} + +module_init(lz4_mod_init); +module_exit(lz4_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Compression Algorithm"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c new file mode 100644 index 000000000..9218b3fed --- /dev/null +++ b/crypto/lz4hc.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include + +struct lz4hc_ctx { + void *lz4hc_comp_mem; +}; + +static int lz4hc_init(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!ctx->lz4hc_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4hc_exit(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lz4hc_comp_mem); +} + +static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4hc = { + .cra_name = "lz4hc", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4hc_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), + .cra_init = lz4hc_init, + .cra_exit = lz4hc_exit, + .cra_u = { .compress = { + .coa_compress = lz4hc_compress_crypto, + .coa_decompress = lz4hc_decompress_crypto } } +}; + +static int __init lz4hc_mod_init(void) +{ + return crypto_register_alg(&alg_lz4hc); +} + +static void __exit lz4hc_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4hc); +} + +module_init(lz4hc_mod_init); +module_exit(lz4hc_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b0b161d03..360db8701 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -462,7 +462,7 @@ static int test_aead(struct crypto_aead *tfm, int enc, ret = crypto_aead_setkey(tfm, key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: aead: setkey failed on " "test %d for %s: flags=%x\n", j, algo, crypto_aead_get_flags(tfm)); @@ -552,7 +552,7 @@ static int test_aead(struct crypto_aead *tfm, int enc, key = template[i].key; ret = crypto_aead_setkey(tfm, key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: aead: setkey failed on " "chunk test %d for %s: flags=%x\n", j, algo, crypto_aead_get_flags(tfm)); @@ -756,7 +756,7 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, ret = crypto_cipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: cipher: setkey failed " "on test %d for %s: flags=%x\n", j, algo, crypto_cipher_get_flags(tfm)); @@ -852,7 +852,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = crypto_ablkcipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: skcipher: setkey failed " "on test %d for %s: flags=%x\n", j, algo, crypto_ablkcipher_get_flags(tfm)); @@ -916,7 +916,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = crypto_ablkcipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: skcipher: setkey failed " "on chunk test %d for %s: flags=%x\n", j, algo, diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index 5b9385239..0d752851a 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -139,6 +139,7 @@ static int __init ks0108_init(void) ks0108_pardevice = parport_register_device(ks0108_parport, KS0108_NAME, NULL, NULL, NULL, PARPORT_DEV_EXCL, NULL); + parport_put_port(ks0108_parport); if (ks0108_pardevice == NULL) { printk(KERN_ERR KS0108_NAME ": ERROR: " "parport didn't register new device\n"); diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 524bf96c2..06c541dc4 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -254,10 +254,10 @@ void * devres_get(struct device *dev, void *new_res, if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; - new_dr = NULL; + new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); - devres_free(new_dr); + devres_free(new_res); return dr->data; } diff --git a/drivers/base/platform.c b/drivers/base/platform.c index a1a722502..5a1373330 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -311,9 +311,7 @@ int platform_device_add(struct platform_device *pdev) failed: while (--i >= 0) { struct resource *r = &pdev->resource[i]; - unsigned long type = resource_type(r); - - if (type == IORESOURCE_MEM || type == IORESOURCE_IO) + if (r->parent) release_resource(r); } @@ -338,9 +336,7 @@ void platform_device_del(struct platform_device *pdev) for (i = 0; i < pdev->num_resources; i++) { struct resource *r = &pdev->resource[i]; - unsigned long type = resource_type(r); - - if (type == IORESOURCE_MEM || type == IORESOURCE_IO) + if (r->parent) release_resource(r); } } diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 1db128951..023a9d79e 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -23,8 +23,7 @@ static struct dentry *regmap_debugfs_root; /* Calculate the length of a fixed format */ static size_t regmap_calc_reg_len(int max_val, char *buf, size_t buf_size) { - snprintf(buf, buf_size, "%x", max_val); - return strlen(buf); + return snprintf(NULL, 0, "%x", max_val); } static ssize_t regmap_name_read_file(struct file *file, @@ -205,7 +204,7 @@ static ssize_t regmap_access_read_file(struct file *file, /* If we're in the region the user is trying to read */ if (p >= *ppos) { /* ...but not beyond it */ - if (buf_pos >= count - 1 - tot_len) + if (buf_pos + tot_len + 1 >= count) break; /* Format the register */ diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a79640712..e93e2ded7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -118,6 +118,8 @@ source "drivers/block/paride/Kconfig" source "drivers/block/mtip32xx/Kconfig" +source "drivers/block/zram/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a81cdd7b9..16477b255 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1314,7 +1314,8 @@ static void blkback_changed(struct xenbus_device *dev, break; /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: - blkfront_closing(info); + if (info) + blkfront_closing(info); break; } } diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig similarity index 70% rename from drivers/staging/zram/Kconfig rename to drivers/block/zram/Kconfig index 983314c41..386ba3d1a 100644 --- a/drivers/staging/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -14,12 +14,13 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - Project home: -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS default n help - This option adds additional debugging code to the compressed - RAM block device driver. + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile new file mode 100644 index 000000000..be0763ff5 --- /dev/null +++ b/drivers/block/zram/Makefile @@ -0,0 +1,5 @@ +zram-y := zcomp_lzo.o zcomp.o zram_drv.o + +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + +obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000..b51a816d7 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif + +static struct zcomp_backend *backends[] = { + &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif + NULL +}; + +static struct zcomp_backend *find_backend(const char *compress) +{ + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(flags); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (!strcmp(comp, backends[i]->name)) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); + else + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); + i++; + } + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; +} + +bool zcomp_available_algorithm(const char *comp) +{ + return find_backend(comp) != NULL; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + return *get_cpu_ptr(comp->stream); +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + put_cpu_ptr(comp->stream); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +static int __zcomp_cpu_notifier(struct zcomp *comp, + unsigned long action, unsigned long cpu) +{ + struct zcomp_strm *zstrm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) + break; + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (IS_ERR_OR_NULL(zstrm)) { + pr_err("Can't allocate a compression stream\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(comp->stream, cpu) = zstrm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + zstrm = *per_cpu_ptr(comp->stream, cpu); + if (!IS_ERR_OR_NULL(zstrm)) + zcomp_strm_free(comp, zstrm); + *per_cpu_ptr(comp->stream, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcomp_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zcomp *comp = container_of(nb, typeof(*comp), notifier); + + return __zcomp_cpu_notifier(comp, action, cpu); +} + +static int zcomp_init(struct zcomp *comp) +{ + unsigned long cpu; + int ret; + + comp->notifier.notifier_call = zcomp_cpu_notifier; + + comp->stream = alloc_percpu(struct zcomp_strm *); + if (!comp->stream) + return -ENOMEM; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu); + if (ret == NOTIFY_BAD) + goto cleanup; + } + __register_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + +void zcomp_destroy(struct zcomp *comp) +{ + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + + free_percpu(comp->stream); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error, or any other error potentially + * returned by zcomp_init(). + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + int error; + + backend = find_backend(compress); + if (!backend) + return ERR_PTR(-EINVAL); + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + + comp->backend = backend; + error = zcomp_init(comp); + if (error) { + kfree(comp); + return ERR_PTR(error); + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000..ffd88cb74 --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(gfp_t flags); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct zcomp_strm * __percpu *stream; + struct zcomp_backend *backend; + struct notifier_block notifier; +}; + +ssize_t zcomp_available_show(const char *comp, char *buf); +bool zcomp_available_algorithm(const char *comp); + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000..0110086ac --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZ4_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZ4_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void zcomp_lz4_destroy(void *private) +{ + kvfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000..60613fb29 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000..ed7a1f054 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZO1X_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZO1X_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void lzo_destroy(void *private) +{ + kvfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000..128c5807f --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c new file mode 100644 index 000000000..a0b07110d --- /dev/null +++ b/drivers/block/zram/zram_drv.c @@ -0,0 +1,1555 @@ +/* + * Compressed RAM block device + * + * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the licence that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + * + */ + +#define KMSG_COMPONENT "zram" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zram_drv.h" + +#ifdef CONFIG_STATE_NOTIFIER +#include +static struct notifier_block notif; +#endif + +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + +static int zram_major; +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +static const char *default_compressor = "lz4"; +#else +static const char *default_compressor = "lzo"; +#endif + +/* Module params (documentation at end) */ +static unsigned int num_devices = 1; + +static inline void deprecated_attr_warn(const char *name) +{ + pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n", + task_pid_nr(current), + current->comm, + name, + "See zram documentation."); +} + +#define ZRAM_ATTR_RO(name) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + \ + deprecated_attr_warn(__stringify(name)); \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static DEVICE_ATTR_RO(name); + +static inline bool init_done(struct zram *zram) +{ + return zram->disksize; +} + +static inline struct zram *dev_to_zram(struct device *dev) +{ + return (struct zram *)dev_to_disk(dev)->private_data; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + return meta->table[index].value & BIT(flag); +} + +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} + +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. + */ +static inline bool valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return false; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return false; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return false; + + /* I/O request is valid */ + return true; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static bool page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return false; + } + + return true; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); +} + +#ifdef CONFIG_STATE_NOTIFIER +static int zram_compact(struct zram *zram) +{ + struct zram_meta *meta; + u64 new_size, data_size; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + pr_info("*** zram state notifier: zram is off ***\n"); + return -EINVAL; + } + + meta = zram->meta; + + data_size = atomic64_read(&zram->stats.compr_data_size); + + pr_info("*** zram state notifier: starting compaction ***\n"); + zs_compact(meta->mem_pool); + + new_size = atomic64_read(&zram->stats.compr_data_size); + if (new_size < data_size) + pr_info("%s compacted. Saved %llu kb.\n", + zram->disk->disk_name, + (unsigned long long)(data_size - new_size)); + pr_info("*** zram state notifier: finished compaction ***\n"); + + up_read(&zram->init_lock); + + return 0; +} + +static int zram_compact_cb(int id, void *ptr, void *data) +{ + zram_compact(ptr); + return 0; +} + +static int state_notifier_callback(struct notifier_block *this, + unsigned long event, void *data) +{ + switch (event) { + case STATE_NOTIFIER_ACTIVE: + break; + case STATE_NOTIFIER_SUSPEND: + idr_for_each(&zram_index_idr, &zram_compact_cb, NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} +#endif + +static ssize_t initstate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%u\n", val); +} + +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + +static ssize_t orig_data_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("orig_data_size"); + return scnprintf(buf, PAGE_SIZE, "%llu\n", + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); +} + +static ssize_t mem_used_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_total"); + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + val = zs_get_total_pages(meta->mem_pool); + } + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_limit"); + down_read(&zram->init_lock); + val = zram->limit_pages; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 limit; + char *tmp; + struct zram *zram = dev_to_zram(dev); + + limit = memparse(buf, &tmp); + if (buf == tmp) /* no chars parsed, invalid input */ + return -EINVAL; + + down_write(&zram->init_lock); + zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t mem_used_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_max"); + down_read(&zram->init_lock); + if (init_done(zram)) + val = atomic_long_read(&zram->stats.max_used_pages); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_used_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int err; + unsigned long val; + struct zram *zram = dev_to_zram(dev); + + err = kstrtoul(buf, 10, &val); + if (err || val != 0) + return -EINVAL; + + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + atomic_long_set(&zram->stats.max_used_pages, + zs_get_total_pages(meta->mem_pool)); + } + up_read(&zram->init_lock); + + return len; +} + +/* + * We switched to per-cpu streams and this attr is not needed anymore. + * However, we will keep it around for some time, because: + * a) we may revert per-cpu streams in the future + * b) it's visible to user space and we need to follow our 2 years + * retirement rule; but we already have a number of 'soon to be + * altered' attrs, so max_comp_streams need to wait for the next + * layoff cycle. + */ +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + return len; +} + +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + size_t sz; + + if (!zcomp_available_algorithm(buf)) + return -EINVAL; + + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + + /* ignore trailing newline */ + sz = strlen(zram->compressor); + if (sz > 0 && zram->compressor[sz - 1] == '\n') + zram->compressor[sz - 1] = 0x00; + + up_write(&zram->init_lock); + return len; +} + +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + meta = zram->meta; + zs_compact(meta->mem_pool); + up_read(&zram->init_lock); + + return len; +} + +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + struct zs_pool_stats pool_stats; + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; + + memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); + + down_read(&zram->init_lock); + if (init_done(zram)) { + mem_used = zs_get_total_pages(zram->meta->mem_pool); + zs_pool_stats(zram->meta->mem_pool, &pool_stats); + } + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + pool_stats.pages_compacted); + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t debug_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int version = 1; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "version: %d\n%8llu\n", + version, + (u64)atomic64_read(&zram->stats.writestall)); + up_read(&zram->init_lock); + + return ret; +} + +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) +{ + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; +} + +static inline void zram_meta_put(struct zram *zram) +{ + atomic_dec(&zram->refcount); +} + +static void zram_meta_free(struct zram_meta *meta, u64 disksize) +{ + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + + zs_destroy_pool(meta->mem_pool); + vfree(meta->table); + kfree(meta); +} + +static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +{ + size_t num_pages; + struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); + + if (!meta) + return NULL; + + num_pages = disksize >> PAGE_SHIFT; + meta->table = vzalloc(num_pages * sizeof(*meta->table)); + if (!meta->table) { + pr_err("Error allocating zram address table\n"); + goto out_error; + } + + meta->mem_pool = zs_create_pool(pool_name); + if (!meta->mem_pool) { + pr_err("Error creating memory pool\n"); + goto out_error; + } + + return meta; + +out_error: + vfree(meta->table); + kfree(meta); + return NULL; +} + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ +static void zram_free_page(struct zram *zram, size_t index) +{ + struct zram_meta *meta = zram->meta; + unsigned long handle = meta->table[index].handle; + + if (unlikely(!handle)) { + /* + * No memory is allocated for zero filled pages. + * Simply clear zero page flag. + */ + if (zram_test_flag(meta, index, ZRAM_ZERO)) { + zram_clear_flag(meta, index, ZRAM_ZERO); + atomic64_dec(&zram->stats.zero_pages); + } + return; + } + + zs_free(meta->mem_pool, handle); + + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); + + meta->table[index].handle = 0; + zram_set_obj_size(meta, index, 0); +} + +static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +{ + int ret = 0; + unsigned char *cmem; + struct zram_meta *meta = zram->meta; + unsigned long handle; + size_t size; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + handle = meta->table[index].handle; + size = zram_get_obj_size(meta, index); + + if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + clear_page(mem); + return 0; + } + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + if (size == PAGE_SIZE) + copy_page(mem, cmem); + else + ret = zcomp_decompress(zram->comp, cmem, size, mem); + zs_unmap_object(meta->mem_pool, handle); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) { + pr_err("Decompression failed! err=%d, page=%u\n", ret, index); + return ret; + } + + return 0; +} + +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page; + unsigned char *user_mem, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + page = bvec->bv_page; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + handle_zero_page(bvec); + return 0; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + if (is_partial_io(bvec)) + /* Use a temporary buffer to decompress the page */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + + user_mem = kmap_atomic(page); + if (!is_partial_io(bvec)) + uncmem = user_mem; + + if (!uncmem) { + pr_err("Unable to allocate temp memory\n"); + ret = -ENOMEM; + goto out_cleanup; + } + + ret = zram_decompress_page(zram, uncmem, index); + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) + goto out_cleanup; + + if (is_partial_io(bvec)) + memcpy(user_mem + bvec->bv_offset, uncmem + offset, + bvec->bv_len); + + flush_dcache_page(page); + ret = 0; +out_cleanup: + kunmap_atomic(user_mem); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset) +{ + int ret = 0; + size_t clen; + unsigned long handle = 0; + struct page *page; + unsigned char *user_mem, *cmem, *src, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm = NULL; + unsigned long alloced_pages; + + page = bvec->bv_page; + if (is_partial_io(bvec)) { + /* + * This is a partial IO. We need to read the full page + * before to write the changes. + */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + if (!uncmem) { + ret = -ENOMEM; + goto out; + } + ret = zram_decompress_page(zram, uncmem, index); + if (ret) + goto out; + } + +compress_again: + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) { + memcpy(uncmem + offset, user_mem + bvec->bv_offset, + bvec->bv_len); + kunmap_atomic(user_mem); + user_mem = NULL; + } else { + uncmem = user_mem; + } + + if (page_zero_filled(uncmem)) { + if (user_mem) + kunmap_atomic(user_mem); + /* Free memory associated with this sector now. */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_ZERO); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.zero_pages); + ret = 0; + goto out; + } + + zstrm = zcomp_strm_find(zram->comp); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + if (!is_partial_io(bvec)) { + kunmap_atomic(user_mem); + user_mem = NULL; + uncmem = NULL; + } + + if (unlikely(ret)) { + pr_err("Compression failed! err=%d\n", ret); + goto out; + } + + src = zstrm->buffer; + if (unlikely(clen > max_zpage_size)) { + clen = PAGE_SIZE; + if (is_partial_io(bvec)) + src = uncmem; + } + + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. + */ + if (!handle) + handle = zs_malloc(meta->mem_pool, clen, +#if 0 /* not implemented yet */ + __GFP_KSWAPD_RECLAIM | +#endif + __GFP_NOWARN | + __GFP_HIGHMEM); + if (!handle) { + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + + atomic64_inc(&zram->stats.writestall); + + handle = zs_malloc(meta->mem_pool, clen, + GFP_NOIO | __GFP_HIGHMEM); + if (handle) + goto compress_again; + + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", + index, clen); + ret = -ENOMEM; + goto out; + } + + alloced_pages = zs_get_total_pages(meta->mem_pool); + update_used_max(zram, alloced_pages); + + if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zs_free(meta->mem_pool, handle); + ret = -ENOMEM; + goto out; + } + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + + if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + src = kmap_atomic(page); + copy_page(cmem, src); + kunmap_atomic(src); + } else { + memcpy(cmem, src, clen); + } + + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + zs_unmap_object(meta->mem_pool, handle); + + /* + * Free memory associated with this sector + * before overwriting unused sectors. + */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + + meta->table[index].handle = handle; + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Update stats */ + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); +out: + if (zstrm) + zcomp_strm_release(zram->comp, zstrm); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_size; + struct zram_meta *meta = zram->meta; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n <= (PAGE_SIZE - offset)) + return; + + n -= (PAGE_SIZE - offset); + index++; + } + + while (n >= PAGE_SIZE) { + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); + index++; + n -= PAGE_SIZE; + } +} + +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) +{ + unsigned long start_time = jiffies; + int ret; + + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); + + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); + } + + generic_end_io_acct(rw, &zram->disk->part0, start_time); + + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } + + return ret; +} + +static void __zram_make_request(struct zram *zram, struct bio *bio) +{ + int offset, rw, i; + u32 index; + struct bio_vec *bvec; + + index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, i) { + int max_transfer_size = PAGE_SIZE - offset; + + if (bvec->bv_len > max_transfer_size) { + /* + * zram_bvec_rw() can only make operation on a single + * zram page. Split the bio vector. + */ + struct bio_vec bv; + + bv.bv_page = bvec->bv_page; + bv.bv_len = max_transfer_size; + bv.bv_offset = bvec->bv_offset; + + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + goto out; + + bv.bv_len = bvec->bv_len - max_transfer_size; + bv.bv_offset += max_transfer_size; + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) + goto out; + } else + if (zram_bvec_rw(zram, bvec, index, offset, rw) < 0) + goto out; + + update_position(&index, &offset, bvec); + } + + set_bit(BIO_UPTODATE, &bio->bi_flags); + bio_endio(bio, 0); + return; + +out: + bio_io_error(bio); +} + +/* + * Handler function for all zram I/O requests. + */ +static void zram_make_request(struct request_queue *queue, struct bio *bio) +{ + struct zram *zram = queue->queuedata; + + if (unlikely(!zram_meta_get(zram))) + goto error; + + if (!valid_io_request(zram, bio->bi_sector, + bio->bi_size)) { + atomic64_inc(&zram->stats.invalid_io); + goto put_zram; + } + + __zram_make_request(zram, bio); + zram_meta_put(zram); + return; +put_zram: + zram_meta_put(zram); +error: + bio_io_error(bio); +} + +static void zram_slot_free_notify(struct block_device *bdev, + unsigned long index) +{ + struct zram *zram; + struct zram_meta *meta; + + zram = bdev->bd_disk->private_data; + meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); +} + +/* not ready for this change right now: Dorimanx */ +#if 0 +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err = -EIO; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (unlikely(!zram_meta_get(zram))) + goto out; + + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + err = -EINVAL; + goto put_zram; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +put_zram: + zram_meta_put(zram); +out: + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} +#endif + +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; + + down_write(&zram->init_lock); + + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + + set_capacity(zram->disk, 0); + part_stat_set_all(&zram->disk->part0, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; + struct zram *zram = dev_to_zram(dev); + int err; + + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; + + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->disk_name, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor); + if (IS_ERR(comp)) { + pr_err("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; + + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; + + if (!do_reset) + return -EINVAL; + + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, anyone can't open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed to reset so open request fails */ + if (zram->claim) + ret = -EBUSY; + + return ret; +} + +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, +#if 0 + .rw_page = zram_rw_page, +#endif + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); + +static struct attribute *zram_disk_attrs[] = { + &dev_attr_disksize.attr, + &dev_attr_initstate.attr, + &dev_attr_reset.attr, + &dev_attr_num_reads.attr, + &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, + &dev_attr_compact.attr, + &dev_attr_invalid_io.attr, + &dev_attr_notify_free.attr, + &dev_attr_zero_pages.attr, + &dev_attr_orig_data_size.attr, + &dev_attr_compr_data_size.attr, + &dev_attr_mem_used_total.attr, + &dev_attr_mem_limit.attr, + &dev_attr_mem_used_max.attr, + &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, + &dev_attr_io_stat.attr, + &dev_attr_mm_stat.attr, + &dev_attr_debug_stat.attr, + NULL, +}; + +static struct attribute_group zram_disk_attr_group = { + .attrs = zram_disk_attrs, +}; + +/* + * Allocate and initialize new zram device. the function returns + * '>= 0' device_id upon success, and negative value otherwise. + */ +static int zram_add(void) +{ + struct zram *zram; + struct request_queue *queue; + int ret, device_id; + + zram = kzalloc(sizeof(struct zram), GFP_KERNEL); + if (!zram) + return -ENOMEM; + + ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_dev; + device_id = ret; + + init_rwsem(&zram->init_lock); + + queue = blk_alloc_queue(GFP_KERNEL); + if (!queue) { + pr_err("Error allocating disk queue for device %d\n", + device_id); + ret = -ENOMEM; + goto out_free_idr; + } + + blk_queue_make_request(queue, zram_make_request); + + /* gendisk structure */ + zram->disk = alloc_disk(1); + if (!zram->disk) { + pr_err("Error allocating disk structure for device %d\n", + device_id); + ret = -ENOMEM; + goto out_free_queue; + } + + zram->disk->major = zram_major; + zram->disk->first_minor = device_id; + zram->disk->fops = &zram_devops; + zram->disk->queue = queue; + zram->disk->queue->queuedata = zram; + zram->disk->private_data = zram; + snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + + __set_bit(QUEUE_FLAG_FAST, &zram->disk->queue->queue_flags); + /* Actual capacity set using syfs (/sys/block/zram/disksize */ + set_capacity(zram->disk, 0); + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* + * To ensure that we always get PAGE_SIZE aligned + * and n*PAGE_SIZED sized I/O requests. + */ + blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); + blk_queue_logical_block_size(zram->disk->queue, + ZRAM_LOGICAL_BLOCK_SIZE); + blk_queue_io_min(zram->disk->queue, PAGE_SIZE); + blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); + + add_disk(zram->disk); + + ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + if (ret < 0) { + pr_err("Error creating sysfs group for device %d\n", + device_id); + goto out_free_disk; + } + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram->meta = NULL; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; + +out_free_disk: + del_gendisk(zram->disk); + put_disk(zram->disk); +out_free_queue: + blk_cleanup_queue(queue); +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); + return ret; +} + +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * hot_remove -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram-control sysfs attributes */ +static ssize_t hot_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t hot_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct zram *zram; + int ret, dev_id; + + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; + + mutex_lock(&zram_index_mutex); + + zram = idr_find(&zram_index_idr, dev_id); + if (zram) { + ret = zram_remove(zram); + idr_remove(&zram_index_idr, dev_id); + } else { + ret = -ENODEV; + } + + mutex_unlock(&zram_index_mutex); + return ret ? ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(hot_add), + __ATTR_WO(hot_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} + +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); + unregister_blkdev(zram_major, "zram"); +} + +static int __init zram_init(void) +{ + int ret; + +#ifdef CONFIG_STATE_NOTIFIER + notif.notifier_call = state_notifier_callback; + if (state_register_client(¬if)) + pr_warn("Failed to register State notifier callback\n"); +#endif + + ret = class_register(&zram_control_class); + if (ret) { + pr_err("Unable to register zram-control class\n"); + return ret; + } + + zram_major = register_blkdev(0, "zram"); + if (zram_major <= 0) { + pr_err("Unable to get major number\n"); + class_unregister(&zram_control_class); + return -EBUSY; + } + + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) + goto out_error; + num_devices--; + } + + return 0; + +out_error: + destroy_devices(); + return ret; +} + +static void __exit zram_exit(void) +{ +#ifdef CONFIG_STATE_NOTIFIER + state_unregister_client(¬if); + notif.notifier_call = NULL; +#endif + destroy_devices(); +} + +module_init(zram_init); +module_exit(zram_exit); + +module_param(num_devices, uint, 0); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta "); +MODULE_DESCRIPTION("Compressed RAM Block Device"); diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h similarity index 53% rename from drivers/staging/zram/zram_drv.h rename to drivers/block/zram/zram_drv.h index 508a19f44..3f5bf66a2 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. @@ -9,22 +10,15 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #ifndef _ZRAM_DRV_H_ #define _ZRAM_DRV_H_ #include -#include +#include -#include "../zsmalloc/zsmalloc.h" - -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; +#include "zcomp.h" /*-- Configurable parameters */ @@ -32,7 +26,7 @@ static const unsigned max_num_devices = 32; * Pages that compress to size greater than this are stored * uncompressed in memory. */ -static const size_t max_zpage_size = PAGE_SIZE / 10 * 9; +static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* * NOTE: max_zpage_size must be less than or equal to: @@ -43,7 +37,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 10 * 9; /*-- End of configurable params */ #define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -51,10 +44,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 10 * 9; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) -/* Flags for zram pages (table[page_no].flags) */ + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. + * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO, + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -62,64 +69,54 @@ enum zram_pageflags { /*-- Data structures */ /* Allocated for each disk page */ -struct table { +struct zram_table_entry { unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ - u8 flags; -} __aligned(4); + unsigned long value; +}; -/* - * All 64bit fields should only be manipulated by 64bit atomic accessors. - * All modifications to 32bit counter should be protected by zram->lock. - */ struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ - atomic64_t failed_reads; /* should NEVER! happen */ + atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - u32 pages_zero; /* no. of zero filled pages */ - u32 pages_stored; /* no. of pages currently stored */ - u32 good_compress; /* % of pages with compression ratio<=50% */ - u32 bad_compress; /* % of pages with compression ratio>=75% */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ + atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ }; struct zram_meta { - void *compress_workmem; - void *compress_buffer; - struct table *table; + struct zram_table_entry *table; struct zs_pool *mem_pool; }; -struct zram_slot_free { - unsigned long index; - struct zram_slot_free *next; -}; - struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, table, - * 32bit stat counters against concurrent - * notifications, reads and writes */ - - struct work_struct free_work; /* handle pending free request */ - struct zram_slot_free *slot_free_rq; /* list head of free request */ - - struct request_queue *queue; + struct zcomp *comp; struct gendisk *disk; - int init_done; - /* Prevent concurrent execution of device init, reset and R/W request */ + /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; + /* + * the number of pages zram can consume for storing compressed data + */ + unsigned long limit_pages; + + struct zram_stats stats; + atomic_t refcount; /* refcount for zram_meta */ + /* wait all IO under all of cpu are done */ + wait_queue_head_t io_done; /* * This is the limit on amount of *uncompressed* worth of data * we can store in a disk. */ u64 disksize; /* bytes */ - spinlock_t slot_free_lock; - - struct zram_stats stats; + char compressor[10]; + /* + * zram is claimed so open request will be failed + */ + bool claim; /* Protected by bdev->bd_mutex */ }; #endif diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index ed4b7481a..93c5b2fdf 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -2945,7 +2945,7 @@ static struct drm_property_blob *drm_property_create_blob(struct drm_device *dev struct drm_property_blob *blob; int ret; - if (!length || !data) + if (!length || length > ULONG_MAX - sizeof(struct drm_property_blob) || !data) return NULL; blob = kzalloc(sizeof(struct drm_property_blob)+length, GFP_KERNEL); diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 2f46bbfbb..b242534f5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -172,11 +172,12 @@ nouveau_gem_info(struct drm_file *file_priv, struct drm_gem_object *gem, struct nouveau_bo *nvbo = nouveau_gem_object(gem); struct nouveau_vma *vma; - if (nvbo->bo.mem.mem_type == TTM_PL_TT) + if (is_power_of_2(nvbo->valid_domains)) + rep->domain = nvbo->valid_domains; + else if (nvbo->bo.mem.mem_type == TTM_PL_TT) rep->domain = NOUVEAU_GEM_DOMAIN_GART; else rep->domain = NOUVEAU_GEM_DOMAIN_VRAM; - rep->offset = nvbo->bo.offset; if (fpriv->vm) { vma = nouveau_bo_vma_find(nvbo, fpriv->vm); diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index b72eb507d..69c2dd085 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -3399,6 +3399,14 @@ void radeon_combios_asic_init(struct drm_device *dev) rdev->pdev->subsystem_device == 0x30ae) return; + /* quirk for rs4xx HP Compaq dc5750 Small Form Factor to make it resume + * - it hangs on resume inside the dynclk 1 table. + */ + if (rdev->family == CHIP_RS480 && + rdev->pdev->subsystem_vendor == 0x103c && + rdev->pdev->subsystem_device == 0x280a) + return; + /* DYN CLK 1 */ table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE); if (table) diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 9184bbe7c..9c5d96cb6 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -82,6 +82,11 @@ void radeon_connector_hotplug(struct drm_connector *connector) if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) { drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); } else if (radeon_dp_needs_link_train(radeon_connector)) { + /* Don't try to start link training before we + * have the dpcd */ + if (!radeon_dp_getdpcd(radeon_connector)) + return; + /* set it to OFF so that drm_helper_connector_dpms() * won't return immediately since the current state * is ON at this point. diff --git a/drivers/gpu/ion/ion.c b/drivers/gpu/ion/ion.c index 9c61bad7e..34d20c949 100644 --- a/drivers/gpu/ion/ion.c +++ b/drivers/gpu/ion/ion.c @@ -325,7 +325,7 @@ static struct ion_handle *ion_handle_create(struct ion_client *client, if (!handle) return ERR_PTR(-ENOMEM); kref_init(&handle->ref); - rb_init_node(&handle->node); + RB_CLEAR_NODE(&handle->node); handle->client = client; ion_buffer_get(buffer); ion_buffer_add_to_handle(buffer); diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c index 5cb6f4f64..791989de8 100644 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -1735,7 +1735,7 @@ static int adreno_of_get_pdata(struct platform_device *pdev) if (adreno_of_read_property(pdev->dev.of_node, "qcom,idle-timeout", &pdata->idle_timeout)) - pdata->idle_timeout = HZ/12; + pdata->idle_timeout = 80; pdata->strtstp_sleepwake = of_property_read_bool(pdev->dev.of_node, "qcom,strtstp-sleepwake"); diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c index 12c40c1ae..2a42e64db 100644 --- a/drivers/gpu/msm/adreno_dispatch.c +++ b/drivers/gpu/msm/adreno_dispatch.c @@ -267,7 +267,7 @@ static struct kgsl_cmdbatch *_get_cmdbatch(struct adreno_context *drawctxt) * it hasn't already been started */ if (!timer_pending(&cmdbatch->timer)) - mod_timer(&cmdbatch->timer, jiffies + (5 * HZ)); + mod_timer(&cmdbatch->timer, jiffies + msecs_to_jiffies(5000)); return ERR_PTR(-EAGAIN); } diff --git a/drivers/gpu/msm/adreno_profile.c b/drivers/gpu/msm/adreno_profile.c index ebe9e587f..3929eee47 100644 --- a/drivers/gpu/msm/adreno_profile.c +++ b/drivers/gpu/msm/adreno_profile.c @@ -909,7 +909,7 @@ static int profile_pipe_print(struct file *filep, char __user *ubuf, kgsl_mutex_unlock(&device->mutex, &device->mutex_owner); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); + schedule_timeout(msecs_to_jiffies(100)); kgsl_mutex_lock(&device->mutex, &device->mutex_owner); if (signal_pending(current)) { diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index c69bbee74..234666488 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -30,7 +30,6 @@ #define KGSL_TIMEOUT_DEFAULT 0xFFFFFFFF #define KGSL_TIMEOUT_PART 50 /* 50 msec */ -#define FIRST_TIMEOUT (HZ / 2) /* KGSL device state is initialized to INIT when platform_probe * diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c index 98140720d..8a2b31085 100644 --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -505,7 +505,6 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev, unsigned int val = 0; struct kgsl_device *device = kgsl_device_from_dev(dev); struct kgsl_pwrctrl *pwr; - const long div = 1000/HZ; int ret; if (device == NULL) @@ -519,8 +518,7 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev, kgsl_mutex_lock(&device->mutex, &device->mutex_owner); /* Let the timeout be requested in ms, but convert to jiffies. */ - val /= div; - pwr->interval_timeout = val; + pwr->interval_timeout = msecs_to_jiffies(val); kgsl_mutex_unlock(&device->mutex, &device->mutex_owner); @@ -532,12 +530,11 @@ static int kgsl_pwrctrl_idle_timer_show(struct device *dev, char *buf) { struct kgsl_device *device = kgsl_device_from_dev(dev); - int mul = 1000/HZ; if (device == NULL) return 0; /* Show the idle_timeout converted to msec */ return snprintf(buf, PAGE_SIZE, "%d\n", - device->pwrctrl.interval_timeout * mul); + jiffies_to_msecs(device->pwrctrl.interval_timeout)); } static int kgsl_pwrctrl_pmqos_latency_store(struct device *dev, @@ -1121,7 +1118,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->power_flags = 0; pwr->idle_needed = pdata->idle_needed; - pwr->interval_timeout = pdata->idle_timeout; + pwr->interval_timeout = msecs_to_jiffies(pdata->idle_timeout); pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) @@ -1722,7 +1719,7 @@ static int _check_active_count(struct kgsl_device *device, int count) int kgsl_active_count_wait(struct kgsl_device *device, int count) { int result = 0; - long wait_jiffies = HZ; + long wait_jiffies = msecs_to_jiffies(1000); BUG_ON(!mutex_is_locked(&device->mutex)); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c889aaef3..90104c6fc 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -856,6 +856,11 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); @@ -3092,7 +3097,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } spin_unlock_irqrestore(&cm.lock, flags); return 0; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5bcb2afd3..228af1894 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -69,7 +69,7 @@ */ struct ib_uverbs_device { - struct kref ref; + atomic_t refcount; int num_comp_vectors; struct completion comp; struct device *dev; @@ -78,6 +78,7 @@ struct ib_uverbs_device { struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct kobject kobj; }; struct ib_uverbs_event_file { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4d27e4c3f..95885b490 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1979,6 +1979,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->send_flags = user_wr->send_flags; if (is_ud) { + if (next->opcode != IB_WR_SEND && + next->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); if (!next->wr.ud.ah) { @@ -2015,9 +2021,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->wr.atomic.compare_add; next->wr.atomic.swap = user_wr->wr.atomic.swap; next->wr.atomic.rkey = user_wr->wr.atomic.rkey; + case IB_WR_SEND: break; default: - break; + ret = -EINVAL; + goto out_put; } } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5b51e4e6e..c8e766924 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -117,14 +117,18 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); -static void ib_uverbs_release_dev(struct kref *ref) +static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = - container_of(ref, struct ib_uverbs_device, ref); + container_of(kobj, struct ib_uverbs_device, kobj); - complete(&dev->comp); + kfree(dev); } +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + static void ib_uverbs_release_event_file(struct kref *ref) { struct ib_uverbs_event_file *file = @@ -273,13 +277,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, return context->device->dealloc_ucontext(context); } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); module_put(file->device->ib_dev->owner); - kref_put(&file->device->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); kfree(file); } @@ -621,9 +631,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) int ret; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); - if (dev) - kref_get(&dev->ref); - else + if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; if (!try_module_get(dev->ib_dev->owner)) { @@ -644,6 +652,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->mutex); filp->private_data = file; + kobject_get(&dev->kobj); return nonseekable_open(inode, filp); @@ -651,13 +660,16 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) module_put(dev->ib_dev->owner); err: - kref_put(&dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + return ret; } static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_device *dev = file->device; ib_uverbs_cleanup_ucontext(file, file->ucontext); @@ -665,6 +677,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); kref_put(&file->ref, ib_uverbs_release_file); + kobject_put(&dev->kobj); return 0; } @@ -760,10 +773,11 @@ static void ib_uverbs_add_one(struct ib_device *device) if (!uverbs_dev) return; - kref_init(&uverbs_dev->ref); + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -790,6 +804,7 @@ static void ib_uverbs_add_one(struct ib_device *device) cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; + uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; @@ -820,9 +835,10 @@ static void ib_uverbs_add_one(struct ib_device *device) clear_bit(devnum, overflow_map); err: - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); return; } @@ -842,9 +858,10 @@ static void ib_uverbs_remove_one(struct ib_device *device) else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); } static char *uverbs_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becda..890c23b3d 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -169,9 +169,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) enum rdma_link_layer ll; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + if (ll == IB_LINK_LAYER_ETHERNET) + ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29; + else + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; if (ah->av.ib.stat_rate) ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index f0f234dda..bc44d4c6c 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -1145,6 +1145,12 @@ config TOUCHSCREEN_ZINITIX_BT531 To compile this driver as a module, choose M here: the module will be called BT531. +config STATE_NOTIFIER + bool "State Notifier" + depends on CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI + default y + + source "drivers/input/touchscreen/wacom/Kconfig" source "drivers/input/touchscreen/gt9xx/Kconfig" source "drivers/input/touchscreen/synaptics/Kconfig" diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index be6587f82..33101e7e9 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -100,3 +100,4 @@ obj-$(CONFIG_TOUCHSCREEN_FTS_S) += stm_s/ obj-$(CONFIG_TOUCHSCREEN_FTS) += stm_fts/ obj-$(CONFIG_TOUCHSCREEN_ZINITIX_BT532) += zinitix_bt532_ts.o obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE) += synaptics_jsglte/ +obj-$(CONFIG_STATE_NOTIFIER) += state_notifier.o diff --git a/drivers/input/touchscreen/state_notifier.c b/drivers/input/touchscreen/state_notifier.c new file mode 100644 index 000000000..7c67b9fc6 --- /dev/null +++ b/drivers/input/touchscreen/state_notifier.c @@ -0,0 +1,135 @@ +/* + * State Notifier Driver + * + * Copyright (c) 2013-2015, Pranav Vashi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include + +#define DEFAULT_SUSPEND_DEFER_TIME 10 +#define STATE_NOTIFIER "state_notifier" + +/* + * debug = 1 will print all + */ +static unsigned int debug = 1; +module_param_named(debug_mask, debug, uint, 0644); + +static bool state_suspended; +module_param_named(state_suspended, state_suspended, bool, 0444); + +#define dprintk(msg...) \ +do { \ + if (debug) \ + pr_info(msg); \ +} while (0) + +static unsigned int suspend_defer_time = DEFAULT_SUSPEND_DEFER_TIME; +module_param_named(suspend_defer_time, suspend_defer_time, uint, 0664); +static struct delayed_work suspend_work; +static struct workqueue_struct *susp_wq; +struct work_struct resume_work; +static bool suspend_in_progress; + +static BLOCKING_NOTIFIER_HEAD(state_notifier_list); + +/** + * state_register_client - register a client notifier + * @nb: notifier block to callback on events + */ +int state_register_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_register_client); + +/** + * state_unregister_client - unregister a client notifier + * @nb: notifier block to callback on events + */ +int state_unregister_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_unregister_client); + +/** + * state_notifier_call_chain - notify clients on state_events + * @val: Value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + */ +int state_notifier_call_chain(unsigned long val, void *v) +{ + return blocking_notifier_call_chain(&state_notifier_list, val, v); +} +EXPORT_SYMBOL_GPL(state_notifier_call_chain); + +static void _suspend_work(struct work_struct *work) +{ + state_notifier_call_chain(STATE_NOTIFIER_SUSPEND, NULL); + state_suspended = true; + suspend_in_progress = false; + dprintk("%s: suspend completed.\n", STATE_NOTIFIER); +} + +static void _resume_work(struct work_struct *work) +{ + state_notifier_call_chain(STATE_NOTIFIER_ACTIVE, NULL); + msleep_interruptible(50); + state_suspended = false; + dprintk("%s: resume completed.\n", STATE_NOTIFIER); +} + +void state_suspend(void) +{ + if (state_suspended || suspend_in_progress) + return; + + dprintk("%s: suspend called.\n", STATE_NOTIFIER); + suspend_in_progress = true; + + queue_delayed_work_on(0, susp_wq, &suspend_work, + msecs_to_jiffies(suspend_defer_time * 1000)); +} + +void state_resume(void) +{ + dprintk("%s: resume called.\n", STATE_NOTIFIER); + cancel_delayed_work_sync(&suspend_work); + suspend_in_progress = false; + + if (state_suspended) + queue_work_on(0, susp_wq, &resume_work); +} + +static int __init state_notifier_init(void) +{ + susp_wq = create_singlethread_workqueue("state_susp_wq"); + + if (!susp_wq) { + pr_err("State Notifier failed to allocate suspend workqueue\n"); + return 0; + } + + INIT_DELAYED_WORK(&suspend_work, _suspend_work); + INIT_WORK(&resume_work, _resume_work); + + return 0; +} + +subsys_initcall(state_notifier_init); + +MODULE_AUTHOR("Pranav Vashi "); +MODULE_DESCRIPTION("State Notifier Driver"); +MODULE_LICENSE("GPLv2"); diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a20d64d4e..4fd4f519c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1931,8 +1931,8 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) static void clear_dte_entry(u16 devid) { /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK; amd_iommu_apply_erratum_63(devid); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index c4ffacb03..42f2090d3 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -277,6 +277,7 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) +#define DTE_FLAG_MASK (0x3ffULL << 32) #define DTE_FLAG_IOTLB (0x01UL << 32) #define DTE_FLAG_GV (0x01ULL << 55) #define DTE_GLX_SHIFT (56) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2e48b2273..f0a84bc3c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1827,13 +1827,20 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -ENOMEM; /* It is large page*/ if (largepage_lvl > 1) { + unsigned long nr_superpages, end_pfn, lvl_pages; + pteval |= DMA_PTE_LARGE_PAGE; - /* Ensure that old small page tables are removed to make room - for superpage, if they exist. */ - dma_pte_clear_range(domain, iov_pfn, - iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1); - dma_pte_free_pagetable(domain, iov_pfn, - iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1); + lvl_pages = lvl_to_nr_pages(largepage_lvl); + + nr_superpages = sg_res / lvl_pages; + end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; + + /* + * Ensure that old small page tables are + * removed to make room for superpage(s). + */ + dma_pte_clear_range(domain, iov_pfn, end_pfn); + dma_pte_free_pagetable(domain, iov_pfn, end_pfn); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; } diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index ce8897933..004fa1089 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -421,7 +421,7 @@ int wf_unregister_client(struct notifier_block *nb) { mutex_lock(&wf_lock); blocking_notifier_chain_unregister(&wf_client_list, nb); - wf_client_count++; + wf_client_count--; if (wf_client_count == 0) wf_stop_thread(); mutex_unlock(&wf_lock); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2241534d3..f3b617eb3 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -347,7 +347,7 @@ config DM_MULTIPATH # of SCSI_DH if the latter isn't defined but if # it is, DM_MULTIPATH must depend on it. We get a build # error if SCSI_DH=m and DM_MULTIPATH=y - depends on SCSI_DH || !SCSI_DH + depends on !SCSI_DH || SCSI ---help--- Allow volume managers to support multipath hardware. diff --git a/drivers/md/md.c b/drivers/md/md.c index 14c3eea6e..29fb41513 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7929,6 +7929,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); } return rv; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index dddd5a47f..be86d59ea 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -502,7 +502,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) r = new_block(s->info, &right); if (r < 0) { - /* FIXME: put left */ + unlock_block(s->info, left); return r; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index de63a1fc3..a0bb34a81 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + unsigned short blksize = 512; if (!conf) return -ENOMEM; @@ -102,6 +103,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", @@ -138,6 +142,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % blksize) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); + err = -EINVAL; + goto abort; + } + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -186,9 +202,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) conf->has_merge_bvec = 1; @@ -257,21 +270,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; - /* - * now since we have the hard sector sizes, we can make sure - * chunk size is a multiple of that sector size - */ - if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", - mdname(mddev), - mddev->chunk_sectors << 9); - goto abort; - } - - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -431,6 +429,27 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; + if (mddev->queue) { + struct md_rdev *rdev; + bool discard_supported = false; + + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a5e678ee7..375cdc68d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1285,6 +1285,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", @@ -2055,6 +2056,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; + bool fail = false; for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2067,6 +2069,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * narrow down and record precise write * errors. */ + fail = true; if (!narrow_write_error(r1_bio, m)) { md_error(conf->mddev, conf->mirrors[m].rdev); @@ -2076,9 +2079,17 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } } static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) @@ -2181,6 +2192,29 @@ static void raid1d(struct mddev *mddev) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&conf->bio_end_io_list, + struct r1bio, retry_list); + list_del(&r1_bio->retry_list); + if (mddev->degraded) + set_bit(R1BIO_Degraded, &r1_bio->state); + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -2582,6 +2616,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 80ded1393..50086cf0e 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -48,6 +48,11 @@ struct r1conf { * block, or anything else. */ struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f9a981b0e..6ce5c8d3b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1453,6 +1453,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", @@ -2526,6 +2527,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } put_buf(r10_bio); } else { + bool fail = false; for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio; @@ -2538,6 +2540,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); set_bit(R10BIO_Degraded, @@ -2555,10 +2558,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } } - if (test_bit(R10BIO_WriteError, - &r10_bio->state)) - close_write(r10_bio); - raid_end_bio_io(r10_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } } } @@ -2572,6 +2582,29 @@ static void raid10d(struct mddev *mddev) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r10_bio = list_first_entry(&conf->bio_end_io_list, + struct r10bio, retry_list); + list_del(&r10_bio->retry_list); + if (mddev->degraded) + set_bit(R10BIO_Degraded, &r10_bio->state); + + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -3270,6 +3303,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7c615613c..c09b38414 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -42,6 +42,12 @@ struct r10conf { sector_t chunk_mask; struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; + /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; int pending_count; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index cec1f8c05..a7ff6b547 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -946,9 +946,6 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) { struct rc_dev *dev = to_rc_dev(device); - if (!dev || !dev->input_dev) - return -ENODEV; - if (dev->rc_map.name) ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name); if (dev->driver_name) diff --git a/drivers/misc/modem_if/sipc4_modem.c b/drivers/misc/modem_if/sipc4_modem.c index 66ae4c1af..2ba3bac89 100755 --- a/drivers/misc/modem_if/sipc4_modem.c +++ b/drivers/misc/modem_if/sipc4_modem.c @@ -105,8 +105,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t, return NULL; } - rb_init_node(&iod->node_chan); - rb_init_node(&iod->node_fmt); + RB_CLEAR_NODE(&iod->node_chan); + RB_CLEAR_NODE(&iod->node_fmt); iod->name = io_t->name; iod->id = io_t->id; diff --git a/drivers/misc/modem_if/sipc5_modem.c b/drivers/misc/modem_if/sipc5_modem.c index 8943686fa..84043a2a7 100755 --- a/drivers/misc/modem_if/sipc5_modem.c +++ b/drivers/misc/modem_if/sipc5_modem.c @@ -108,8 +108,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t, return NULL; } - rb_init_node(&iod->node_chan); - rb_init_node(&iod->node_fmt); + RB_CLEAR_NODE(&iod->node_chan); + RB_CLEAR_NODE(&iod->node_fmt); iod->name = io_t->name; iod->id = io_t->id; diff --git a/drivers/misc/sec_misc.c b/drivers/misc/sec_misc.c index dbd80b854..5a6a17eb7 100755 --- a/drivers/misc/sec_misc.c +++ b/drivers/misc/sec_misc.c @@ -283,7 +283,7 @@ static ssize_t drop_caches_store si_meminfo(&i); printk("[Before]\nMemFree : %8lu kB\n", K(i.freeram)); printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \ - total_swapcache_pages - i.bufferram)); + total_swapcache_pages() - i.bufferram)); iterate_supers(drop_pagecache_sb, NULL); drop_slab(); @@ -291,7 +291,7 @@ static ssize_t drop_caches_store si_meminfo(&i); printk("[After]\nMemFree : %8lu kB\n", K(i.freeram)); printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \ - total_swapcache_pages - i.bufferram)); + total_swapcache_pages() - i.bufferram)); printk("Cached Drop done!\n"); } out: diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 43f1a0011..8f793ea1d 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -942,6 +942,11 @@ static int validate_vid_hdr(const struct ubi_device *ubi, goto bad; } + if (data_size > ubi->leb_size) { + dbg_err("bad data_size"); + goto bad; + } + if (vol_type == UBI_VID_STATIC) { /* * Although from high-level point of view static volumes may diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index c015fc0a7..4105a508f 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -656,6 +656,7 @@ static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si, if (ubi->corr_peb_count) ubi_err("%d PEBs are corrupted and not used", ubi->corr_peb_count); + return -ENOSPC; } ubi->rsvd_pebs += reserved_pebs; ubi->avail_pebs -= reserved_pebs; diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 424260a3c..43422b94d 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1516,6 +1516,7 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) if (ubi->corr_peb_count) ubi_err("%d PEBs are corrupted and not used", ubi->corr_peb_count); + err = -ENOSPC; goto out_free; } ubi->avail_pebs -= WL_RESERVED_PEBS; diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index cac5b256a..37534a06b 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -683,6 +683,7 @@ void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) hw->max_rate_tries = 10; hw->sta_data_size = sizeof(struct ath_node); hw->vif_data_size = sizeof(struct ath_vif); + hw->extra_tx_headroom = 4; hw->wiphy->available_antennas_rx = BIT(ah->caps.max_rxchains) - 1; hw->wiphy->available_antennas_tx = BIT(ah->caps.max_txchains) - 1; diff --git a/drivers/net/wireless/bcmdhd/wl_cfg80211.c b/drivers/net/wireless/bcmdhd/wl_cfg80211.c index 671c85a42..24081ea33 100644 --- a/drivers/net/wireless/bcmdhd/wl_cfg80211.c +++ b/drivers/net/wireless/bcmdhd/wl_cfg80211.c @@ -10084,7 +10084,7 @@ wl_cfg80211_netdev_notifier_call(struct notifier_block * nb, break; } set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(100); + schedule_timeout(HZ); set_current_state(TASK_RUNNING); refcnt++; } diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c index 56f41c940..6314e24c2 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c @@ -1063,7 +1063,7 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw, u8 *pn = seq.ccmp.pn; ieee80211_get_key_rx_seq(key, i, &seq); - aes_sc->pn = cpu_to_le64( + aes_sc[i].pn = cpu_to_le64( (u64)pn[5] | ((u64)pn[4] << 8) | ((u64)pn[3] << 16) | diff --git a/drivers/of/address.c b/drivers/of/address.c index 693cdaaf2..05c653958 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -623,10 +623,10 @@ struct device_node *of_find_matching_node_by_address(struct device_node *from, struct resource res; while (dn) { - if (of_address_to_resource(dn, 0, &res)) - continue; - if (res.start == base_address) + if (!of_address_to_resource(dn, 0, &res) && + res.start == base_address) return dn; + dn = of_find_matching_node(dn, matches); } diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 2a581642c..f49d961cf 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -357,6 +357,56 @@ static const struct pci_vpd_ops pci_vpd_pci22_ops = { .release = pci_vpd_pci22_release, }; +static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, + void *arg) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + ssize_t ret; + + if (!tdev) + return -ENODEV; + + ret = pci_read_vpd(tdev, pos, count, arg); + pci_dev_put(tdev); + return ret; +} + +static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count, + const void *arg) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + ssize_t ret; + + if (!tdev) + return -ENODEV; + + ret = pci_write_vpd(tdev, pos, count, arg); + pci_dev_put(tdev); + return ret; +} + +static const struct pci_vpd_ops pci_vpd_f0_ops = { + .read = pci_vpd_f0_read, + .write = pci_vpd_f0_write, + .release = pci_vpd_pci22_release, +}; + +static int pci_vpd_f0_dev_check(struct pci_dev *dev) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + int ret = 0; + + if (!tdev) + return -ENODEV; + if (!tdev->vpd || !tdev->multifunction || + dev->class != tdev->class || dev->vendor != tdev->vendor || + dev->device != tdev->device) + ret = -ENODEV; + + pci_dev_put(tdev); + return ret; +} + int pci_vpd_pci22_init(struct pci_dev *dev) { struct pci_vpd_pci22 *vpd; @@ -365,12 +415,21 @@ int pci_vpd_pci22_init(struct pci_dev *dev) cap = pci_find_capability(dev, PCI_CAP_ID_VPD); if (!cap) return -ENODEV; + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { + int ret = pci_vpd_f0_dev_check(dev); + + if (ret) + return ret; + } vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC); if (!vpd) return -ENOMEM; vpd->base.len = PCI_VPD_PCI22_SIZE; - vpd->base.ops = &pci_vpd_pci22_ops; + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) + vpd->base.ops = &pci_vpd_f0_ops; + else + vpd->base.ops = &pci_vpd_pci22_ops; mutex_init(&vpd->lock); vpd->cap = cap; vpd->busy = false; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index c0300242d..3ce87c82f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1883,6 +1883,15 @@ static void __devinit quirk_netmos(struct pci_dev *dev) DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_NETMOS, PCI_ANY_ID, PCI_CLASS_COMMUNICATION_SERIAL, 8, quirk_netmos); +static void quirk_f0_vpd_link(struct pci_dev *dev) +{ + if ((dev->class >> 8) != PCI_CLASS_NETWORK_ETHERNET || + !dev->multifunction || !PCI_FUNC(dev->devfn)) + return; + dev->dev_flags |= PCI_DEV_FLAGS_VPD_REF_F0; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_f0_vpd_link); + static void __devinit quirk_e100_interrupt(struct pci_dev *dev) { u16 command, pmcsr; @@ -2834,12 +2843,15 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x3c28, vtd_mask_spec_errors); static void __devinit fixup_ti816x_class(struct pci_dev* dev) { + u32 class = dev->class; + /* TI 816x devices do not have class code set when in PCIe boot mode */ - dev_info(&dev->dev, "Setting PCI class for 816x PCIe device\n"); - dev->class = PCI_CLASS_MULTIMEDIA_VIDEO; + dev->class = PCI_CLASS_MULTIMEDIA_VIDEO << 8; + dev_info(&dev->dev, "PCI class overridden (%#08x -> %#08x)\n", + class, dev->class); } DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_TI, 0xb800, - PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class); + PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class); /* Some PCIe devices do not work reliably with the claimed maximum * payload size supported. diff --git a/drivers/platform/msm/usb_bam.c b/drivers/platform/msm/usb_bam.c index bdb7e4122..913dc0bae 100644 --- a/drivers/platform/msm/usb_bam.c +++ b/drivers/platform/msm/usb_bam.c @@ -31,7 +31,7 @@ #define USB_THRESHOLD 512 #define USB_BAM_MAX_STR_LEN 50 -#define USB_BAM_TIMEOUT (10*HZ) +#define USB_BAM_TIMEOUT 10000 enum usb_bam_sm { USB_BAM_SM_INIT = 0, @@ -1073,7 +1073,7 @@ static void wait_for_prod_granted(enum usb_bam cur_bam) } else if (ret == -EINPROGRESS) { pr_debug("%s: Waiting for PROD_GRANTED\n", __func__); if (!wait_for_completion_timeout(&info.prod_avail[cur_bam], - USB_BAM_TIMEOUT)) + msecs_to_jiffies(USB_BAM_TIMEOUT))) pr_err("%s: Timeout wainting for PROD_GRANTED\n", __func__); } else @@ -1117,7 +1117,7 @@ static void wait_for_prod_release(enum usb_bam cur_bam) } else if (ret == -EINPROGRESS) { pr_debug("%s: Waiting for PROD_RELEASED\n", __func__); if (!wait_for_completion_timeout(&info.prod_released[cur_bam], - USB_BAM_TIMEOUT)) + msecs_to_jiffies(USB_BAM_TIMEOUT))) pr_err("%s: Timeout waiting for PROD_RELEASED\n", __func__); } else @@ -2073,7 +2073,7 @@ void usb_bam_reset_complete(void) { pr_debug("Waiting for reset compelte"); if (wait_for_completion_interruptible_timeout(&ctx.reset_done, - 10*HZ) <= 0) + msecs_to_jiffies(10000)) <= 0) pr_warn("Timeout while waiting for reset"); pr_debug("Finished Waiting for reset complete"); diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index a4f1a503b..0303e0256 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2592,7 +2592,7 @@ static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); int i; int ret = 0; diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 51ee663c1..ec96c5fe3 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -720,7 +720,7 @@ static void async_sas_ata_eh(void *data, async_cookie_t cookie) void sas_ata_strategy_handler(struct Scsi_Host *shost) { struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); - LIST_HEAD(async); + ASYNC_DOMAIN_EXCLUSIVE(async); int i; /* it's ok to defer revalidation events during ata eh, these diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index dbb8edfc8..06da698da 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -984,6 +984,8 @@ static void mvs_slot_free(struct mvs_info *mvi, u32 rx_desc) static void mvs_slot_task_free(struct mvs_info *mvi, struct sas_task *task, struct mvs_slot_info *slot, u32 slot_idx) { + if (!slot) + return; if (!slot->task) return; if (!sas_protocol_ata(task->task_proto)) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 29fabffd6..b1f76da1e 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include diff --git a/drivers/sensorhub/stm32f/ssp.h b/drivers/sensorhub/stm32f/ssp.h index 414182817..81c24adf0 100755 --- a/drivers/sensorhub/stm32f/ssp.h +++ b/drivers/sensorhub/stm32f/ssp.h @@ -48,7 +48,7 @@ #include "ssp_sensorhub.h" #endif -#define SSP_DBG 1 +#define SSP_DBG 0 #define SUCCESS 1 #define FAIL 0 @@ -56,18 +56,19 @@ #define FACTORY_DATA_MAX 100 -#if SSP_DBG -#define SSP_FUNC_DBG 1 -#define SSP_DATA_DBG 0 - /* ssp mcu device ID */ #define DEVICE_ID 0x55 +#if SSP_DBG +#define SSP_FUNC_DBG 1 +#define SSP_DATA_DBG 0 #define ssp_dbg(dev, format, ...) do { \ printk(KERN_INFO dev, format, ##__VA_ARGS__); \ } while (0) #else +#define SSP_FUNC_DBG 0 +#define SSP_DATA_DBG 0 #define ssp_dbg(dev, format, ...) #endif diff --git a/drivers/sensorhub/stm32f/ssp_debug.c b/drivers/sensorhub/stm32f/ssp_debug.c index af356f4ca..3d25335b4 100755 --- a/drivers/sensorhub/stm32f/ssp_debug.c +++ b/drivers/sensorhub/stm32f/ssp_debug.c @@ -196,7 +196,9 @@ int print_mcu_debug(char *pchRcvDataFrame, int *pDataIdx, int iRcvDataFrameLength) { int iLength = pchRcvDataFrame[(*pDataIdx)++]; +#if SSP_DBG int cur = *pDataIdx; +#endif if (iLength > iRcvDataFrameLength - *pDataIdx || iLength <= 0) { ssp_dbg("[SSP]: MSG From MCU - invalid debug length(%d/%d/%d)\n", diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index dc25bee8d..2ecc2d646 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -799,6 +799,10 @@ static irqreturn_t ssp_int(int irq, void *dev_id) if (!(sccr1_reg & SSCR1_TIE)) mask &= ~SSSR_TFS; + /* Ignore RX timeout interrupt if it is disabled */ + if (!(sccr1_reg & SSCR1_TINTE)) + mask &= ~SSSR_TINT; + if (!(status & mask)) return IRQ_NONE; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 3d8f662e4..a3f31e9ab 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -831,8 +831,7 @@ static struct class spi_master_class = { * * The caller is responsible for assigning the bus number and initializing * the master's methods before calling spi_register_master(); and (after errors - * adding the device) calling spi_master_put() and kfree() to prevent a memory - * leak. + * adding the device) calling spi_master_put() to prevent a memory leak. */ struct spi_master *spi_alloc_master(struct device *dev, unsigned size) { diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index d6931e9cd..c15649439 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -80,12 +80,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zram/Kconfig" - -source "drivers/staging/zcache/Kconfig" - -source "drivers/staging/zsmalloc/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" @@ -126,8 +120,6 @@ source "drivers/staging/android/Kconfig" source "drivers/staging/telephony/Kconfig" -source "drivers/staging/ramster/Kconfig" - source "drivers/staging/ozwpan/Kconfig" source "drivers/staging/vnswap/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 8229e82e9..605826f30 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,9 +32,6 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZRAM) += zram/ -obj-$(CONFIG_ZCACHE) += zcache/ -obj-$(CONFIG_ZSMALLOC) += zsmalloc/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xx/ @@ -54,6 +51,5 @@ obj-$(CONFIG_MFD_NVEC) += nvec/ obj-$(CONFIG_DRM_OMAP) += omapdrm/ obj-$(CONFIG_ANDROID) += android/ obj-$(CONFIG_PHONE) += telephony/ -obj-$(CONFIG_RAMSTER) += ramster/ obj-$(CONFIG_USB_WPAN_HCD) += ozwpan/ obj-$(CONFIG_VNSWAP) += vnswap/ diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 159cbf8a7..bf2158b6c 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -144,7 +145,7 @@ static DEFINE_MUTEX(scan_mutex); #endif #if defined(CONFIG_ZSWAP) -extern atomic_t zswap_pool_pages; +extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #endif @@ -174,6 +175,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) bool is_active_high; bool flag = 0; #endif + u32 zswap_pool_total_pages; if (nr_to_scan > 0) { if (mutex_lock_interruptible(&scan_mutex) < 0) @@ -196,7 +198,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) is_active_high = (global_page_state(NR_ACTIVE_FILE) > global_page_state(NR_INACTIVE_FILE)) ? 1 : 0; #endif - other_file = global_page_state(NR_FILE_PAGES); + other_file = global_page_state(NR_FILE_PAGES) + zcache_pages(); #if defined(CONFIG_CMA_PAGE_COUNTING) && defined(CONFIG_EXCLUDE_LRU_LIVING_IN_CMA) if (get_nr_swap_pages() < SSWAP_LMK_THRESHOLD && cma_page_ratio >= CMA_PAGE_RATIO @@ -205,8 +207,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) flag = 1; } #endif - if (global_page_state(NR_SHMEM) + total_swapcache_pages < other_file) - other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages; + if (global_page_state(NR_SHMEM) + total_swapcache_pages() < other_file) + other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages(); else other_file = 0; @@ -275,7 +277,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) #if defined(CONFIG_ZSWAP) if (atomic_read(&zswap_stored_pages)) { lowmem_print(3, "shown tasksize : %d\n", tasksize); - tasksize += atomic_read(&zswap_pool_pages) * get_mm_counter(p->mm, MM_SWAPENTS) + zswap_pool_total_pages = DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + tasksize += zswap_pool_total_pages * get_mm_counter(p->mm, MM_SWAPENTS) / atomic_read(&zswap_stored_pages); lowmem_print(3, "real tasksize : %d\n", tasksize); } @@ -301,22 +304,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) #if defined(CONFIG_CMA_PAGE_COUNTING) lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, " "ofree %d, ofile %d(%c), is_kswapd %d - " - "cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n", + "cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n,Total zcache is %ldkB\n", selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize, other_free, other_file, flag ? '-' : '+', !!current_is_kswapd(), nr_cma_free, sc->priority, - nr_cma_inactive_file, nr_cma_active_file); + nr_cma_inactive_file, nr_cma_active_file, (long)zcache_pages() * (long)(PAGE_SIZE / 1024)); #else lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, " "free memory = %d, reclaimable memory = %d " - "is_kswapd %d cma_free %lu priority %d\n", + "is_kswapd %d cma_free %lu priority %d\nTotal zcache is %ldkB\n", selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize, other_free, other_file, !!current_is_kswapd(), - nr_cma_free, sc->priority); + nr_cma_free, sc->priority,(long)zcache_pages() * (long)(PAGE_SIZE / 1024)); #endif lowmem_deathpending_timeout = jiffies + HZ; send_sig(SIGKILL, selected, 0); @@ -399,9 +402,9 @@ static int android_oom_handler(struct notifier_block *nb, nr_cma_inactive_file = global_page_state(NR_CMA_INACTIVE_FILE); nr_cma_active_file = global_page_state(NR_CMA_ACTIVE_FILE); - other_file = global_page_state(NR_FILE_PAGES) - + other_file = global_page_state(NR_FILE_PAGES) + zcache_pages() - global_page_state(NR_SHMEM) - - total_swapcache_pages - + total_swapcache_pages() - nr_cma_inactive_file - nr_cma_active_file; #endif diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig deleted file mode 100644 index 4af1f8d4b..000000000 --- a/drivers/staging/ramster/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config RAMSTER - bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem" - depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS=y && !ZCACHE && !XVMALLOC && !HIGHMEM - select LZO_COMPRESS - select LZO_DECOMPRESS - default n - help - RAMster allows RAM on other machines in a cluster to be utilized - dynamically and symmetrically instead of swapping to a local swap - disk, thus improving performance on memory-constrained workloads - while minimizing total RAM across the cluster. RAMster, like - zcache, compresses swap pages into local RAM, but then remotifies - the compressed pages to another node in the RAMster cluster. diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile deleted file mode 100644 index bcc13c87f..000000000 --- a/drivers/staging/ramster/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_RAMSTER) += zcache-main.o tmem.o r2net.o xvmalloc.o cluster/ diff --git a/drivers/staging/ramster/TODO b/drivers/staging/ramster/TODO deleted file mode 100644 index 46fcf0c58..000000000 --- a/drivers/staging/ramster/TODO +++ /dev/null @@ -1,13 +0,0 @@ -For this staging driver, RAMster duplicates code from drivers/staging/zcache -then incorporates changes to the local copy of the code. For V5, it also -directly incorporates the soon-to-be-removed drivers/staging/zram/xvmalloc.[ch] -as all testing has been done with xvmalloc rather than the new zsmalloc. -Before RAMster can be promoted from staging, the zcache and RAMster drivers -should be either merged or reorganized to separate out common code. - -Until V4, RAMster duplicated code from fs/ocfs2/cluster, but this made -RAMster incompatible with ocfs2 running in the same kernel and included -lots of code that could be removed. As of V5, the ocfs2 code has been -mined and made RAMster-specific, made to communicate with a userland -ramster-tools package rather than ocfs2-tools, and can co-exist with ocfs2 -both in the same kernel and in userland on the same machine. diff --git a/drivers/staging/ramster/cluster/Makefile b/drivers/staging/ramster/cluster/Makefile deleted file mode 100755 index 9c6943652..000000000 --- a/drivers/staging/ramster/cluster/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_RAMSTER) += ramster_nodemanager.o - -ramster_nodemanager-objs := heartbeat.o masklog.o nodemanager.o tcp.o diff --git a/drivers/staging/ramster/cluster/heartbeat.c b/drivers/staging/ramster/cluster/heartbeat.c deleted file mode 100755 index 002094907..000000000 --- a/drivers/staging/ramster/cluster/heartbeat.c +++ /dev/null @@ -1,464 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include - -#include "heartbeat.h" -#include "tcp.h" -#include "nodemanager.h" - -#include "masklog.h" - -/* - * The first heartbeat pass had one global thread that would serialize all hb - * callback calls. This global serializing sem should only be removed once - * we've made sure that all callees can deal with being called concurrently - * from multiple hb region threads. - */ -static DECLARE_RWSEM(r2hb_callback_sem); - -/* - * multiple hb threads are watching multiple regions. A node is live - * whenever any of the threads sees activity from the node in its region. - */ -static DEFINE_SPINLOCK(r2hb_live_lock); -static unsigned long r2hb_live_node_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)]; - -static struct r2hb_callback { - struct list_head list; -} r2hb_callbacks[R2HB_NUM_CB]; - -enum r2hb_heartbeat_modes { - R2HB_HEARTBEAT_LOCAL = 0, - R2HB_HEARTBEAT_GLOBAL, - R2HB_HEARTBEAT_NUM_MODES, -}; - -char *r2hb_heartbeat_mode_desc[R2HB_HEARTBEAT_NUM_MODES] = { - "local", /* R2HB_HEARTBEAT_LOCAL */ - "global", /* R2HB_HEARTBEAT_GLOBAL */ -}; - -unsigned int r2hb_dead_threshold = R2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int r2hb_heartbeat_mode = R2HB_HEARTBEAT_LOCAL; - -/* Only sets a new threshold if there are no active regions. - * - * No locking or otherwise interesting code is required for reading - * r2hb_dead_threshold as it can't change once regions are active and - * it's not interesting to anyone until then anyway. */ -static void r2hb_dead_threshold_set(unsigned int threshold) -{ - if (threshold > R2HB_MIN_DEAD_THRESHOLD) { - spin_lock(&r2hb_live_lock); - r2hb_dead_threshold = threshold; - spin_unlock(&r2hb_live_lock); - } -} - -static int r2hb_global_hearbeat_mode_set(unsigned int hb_mode) -{ - int ret = -1; - - if (hb_mode < R2HB_HEARTBEAT_NUM_MODES) { - spin_lock(&r2hb_live_lock); - r2hb_heartbeat_mode = hb_mode; - ret = 0; - spin_unlock(&r2hb_live_lock); - } - - return ret; -} - -void r2hb_exit(void) -{ -} - -int r2hb_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(r2hb_callbacks); i++) - INIT_LIST_HEAD(&r2hb_callbacks[i].list); - - memset(r2hb_live_node_bitmap, 0, sizeof(r2hb_live_node_bitmap)); - - return 0; -} - -/* if we're already in a callback then we're already serialized by the sem */ -static void r2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) -{ - BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &r2hb_live_node_bitmap, bytes); -} - -/* - * get a map of all nodes that are heartbeating in any regions - */ -void r2hb_fill_node_map(unsigned long *map, unsigned bytes) -{ - /* callers want to serialize this map and callbacks so that they - * can trust that they don't miss nodes coming to the party */ - down_read(&r2hb_callback_sem); - spin_lock(&r2hb_live_lock); - r2hb_fill_node_map_from_callback(map, bytes); - spin_unlock(&r2hb_live_lock); - up_read(&r2hb_callback_sem); -} -EXPORT_SYMBOL_GPL(r2hb_fill_node_map); - -/* - * heartbeat configfs bits. The heartbeat set is a default set under - * the cluster set in nodemanager.c. - */ - -/* heartbeat set */ - -struct r2hb_hb_group { - struct config_group hs_group; - /* some stuff? */ -}; - -static struct r2hb_hb_group *to_r2hb_hb_group(struct config_group *group) -{ - return group ? - container_of(group, struct r2hb_hb_group, hs_group) - : NULL; -} - -static struct config_item r2hb_config_item; - -static struct config_item *r2hb_hb_group_make_item(struct config_group *group, - const char *name) -{ - int ret; - - if (strlen(name) > R2HB_MAX_REGION_NAME_LEN) { - ret = -ENAMETOOLONG; - goto free; - } - - config_item_put(&r2hb_config_item); - - return &r2hb_config_item; -free: - return ERR_PTR(ret); -} - -static void r2hb_hb_group_drop_item(struct config_group *group, - struct config_item *item) -{ - if (r2hb_global_heartbeat_active()) { - printk(KERN_NOTICE "ramster: Heartbeat %s " - "on region %s (%s)\n", - "stopped/aborted", config_item_name(item), - "no region"); - } - - config_item_put(item); -} - -struct r2hb_hb_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2hb_hb_group *, char *); - ssize_t (*store)(struct r2hb_hb_group *, const char *, size_t); -}; - -static ssize_t r2hb_hb_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item)); - struct r2hb_hb_group_attribute *r2hb_hb_group_attr = - container_of(attr, struct r2hb_hb_group_attribute, attr); - ssize_t ret = 0; - - if (r2hb_hb_group_attr->show) - ret = r2hb_hb_group_attr->show(reg, page); - return ret; -} - -static ssize_t r2hb_hb_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item)); - struct r2hb_hb_group_attribute *r2hb_hb_group_attr = - container_of(attr, struct r2hb_hb_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (r2hb_hb_group_attr->store) - ret = r2hb_hb_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t r2hb_hb_group_threshold_show(struct r2hb_hb_group *group, - char *page) -{ - return sprintf(page, "%u\n", r2hb_dead_threshold); -} - -static ssize_t r2hb_hb_group_threshold_store(struct r2hb_hb_group *group, - const char *page, - size_t count) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - /* this will validate ranges for us. */ - r2hb_dead_threshold_set((unsigned int) tmp); - - return count; -} - -static -ssize_t r2hb_hb_group_mode_show(struct r2hb_hb_group *group, - char *page) -{ - return sprintf(page, "%s\n", - r2hb_heartbeat_mode_desc[r2hb_heartbeat_mode]); -} - -static -ssize_t r2hb_hb_group_mode_store(struct r2hb_hb_group *group, - const char *page, size_t count) -{ - unsigned int i; - int ret; - size_t len; - - len = (page[count - 1] == '\n') ? count - 1 : count; - if (!len) - return -EINVAL; - - for (i = 0; i < R2HB_HEARTBEAT_NUM_MODES; ++i) { - if (strnicmp(page, r2hb_heartbeat_mode_desc[i], len)) - continue; - - ret = r2hb_global_hearbeat_mode_set(i); - if (!ret) - printk(KERN_NOTICE "ramster: Heartbeat mode " - "set to %s\n", - r2hb_heartbeat_mode_desc[i]); - return count; - } - - return -EINVAL; - -} - -static struct r2hb_hb_group_attribute r2hb_hb_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2hb_hb_group_threshold_show, - .store = r2hb_hb_group_threshold_store, -}; - -static struct r2hb_hb_group_attribute r2hb_hb_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2hb_hb_group_mode_show, - .store = r2hb_hb_group_mode_store, -}; - -static struct configfs_attribute *r2hb_hb_group_attrs[] = { - &r2hb_hb_group_attr_threshold.attr, - &r2hb_hb_group_attr_mode.attr, - NULL, -}; - -static struct configfs_item_operations r2hb_hearbeat_group_item_ops = { - .show_attribute = r2hb_hb_group_show, - .store_attribute = r2hb_hb_group_store, -}; - -static struct configfs_group_operations r2hb_hb_group_group_ops = { - .make_item = r2hb_hb_group_make_item, - .drop_item = r2hb_hb_group_drop_item, -}; - -static struct config_item_type r2hb_hb_group_type = { - .ct_group_ops = &r2hb_hb_group_group_ops, - .ct_item_ops = &r2hb_hearbeat_group_item_ops, - .ct_attrs = r2hb_hb_group_attrs, - .ct_owner = THIS_MODULE, -}; - -/* this is just here to avoid touching group in heartbeat.h which the - * entire damn world #includes */ -struct config_group *r2hb_alloc_hb_set(void) -{ - struct r2hb_hb_group *hs = NULL; - struct config_group *ret = NULL; - - hs = kzalloc(sizeof(struct r2hb_hb_group), GFP_KERNEL); - if (hs == NULL) - goto out; - - config_group_init_type_name(&hs->hs_group, "heartbeat", - &r2hb_hb_group_type); - - ret = &hs->hs_group; -out: - if (ret == NULL) - kfree(hs); - return ret; -} - -void r2hb_free_hb_set(struct config_group *group) -{ - struct r2hb_hb_group *hs = to_r2hb_hb_group(group); - kfree(hs); -} - -/* hb callback registration and issuing */ - -static struct r2hb_callback *hbcall_from_type(enum r2hb_callback_type type) -{ - if (type == R2HB_NUM_CB) - return ERR_PTR(-EINVAL); - - return &r2hb_callbacks[type]; -} - -void r2hb_setup_callback(struct r2hb_callback_func *hc, - enum r2hb_callback_type type, - r2hb_cb_func *func, - void *data, - int priority) -{ - INIT_LIST_HEAD(&hc->hc_item); - hc->hc_func = func; - hc->hc_data = data; - hc->hc_priority = priority; - hc->hc_type = type; - hc->hc_magic = R2HB_CB_MAGIC; -} -EXPORT_SYMBOL_GPL(r2hb_setup_callback); - -int r2hb_register_callback(const char *region_uuid, - struct r2hb_callback_func *hc) -{ - struct r2hb_callback_func *tmp; - struct list_head *iter; - struct r2hb_callback *hbcall; - int ret; - - BUG_ON(hc->hc_magic != R2HB_CB_MAGIC); - BUG_ON(!list_empty(&hc->hc_item)); - - hbcall = hbcall_from_type(hc->hc_type); - if (IS_ERR(hbcall)) { - ret = PTR_ERR(hbcall); - goto out; - } - - down_write(&r2hb_callback_sem); - - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct r2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); - break; - } - } - if (list_empty(&hc->hc_item)) - list_add_tail(&hc->hc_item, &hbcall->list); - - up_write(&r2hb_callback_sem); - ret = 0; -out: - mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", - ret, __builtin_return_address(0), hc); - return ret; -} -EXPORT_SYMBOL_GPL(r2hb_register_callback); - -void r2hb_unregister_callback(const char *region_uuid, - struct r2hb_callback_func *hc) -{ - BUG_ON(hc->hc_magic != R2HB_CB_MAGIC); - - mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", - __builtin_return_address(0), hc); - - /* XXX Can this happen _with_ a region reference? */ - if (list_empty(&hc->hc_item)) - return; - - down_write(&r2hb_callback_sem); - - list_del_init(&hc->hc_item); - - up_write(&r2hb_callback_sem); -} -EXPORT_SYMBOL_GPL(r2hb_unregister_callback); - -int r2hb_check_node_heartbeating_from_callback(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(R2NM_MAX_NODES)]; - - r2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(r2hb_check_node_heartbeating_from_callback); - -void r2hb_stop_all_regions(void) -{ -} -EXPORT_SYMBOL_GPL(r2hb_stop_all_regions); - -/* - * this is just a hack until we get the plumbing which flips file systems - * read only and drops the hb ref instead of killing the node dead. - */ -int r2hb_global_heartbeat_active(void) -{ - return (r2hb_heartbeat_mode == R2HB_HEARTBEAT_GLOBAL); -} -EXPORT_SYMBOL(r2hb_global_heartbeat_active); - -/* added for RAMster */ -void r2hb_manual_set_node_heartbeating(int node_num) -{ - if (node_num < R2NM_MAX_NODES) - set_bit(node_num, r2hb_live_node_bitmap); -} -EXPORT_SYMBOL(r2hb_manual_set_node_heartbeating); diff --git a/drivers/staging/ramster/cluster/heartbeat.h b/drivers/staging/ramster/cluster/heartbeat.h deleted file mode 100755 index 6cbc775bd..000000000 --- a/drivers/staging/ramster/cluster/heartbeat.h +++ /dev/null @@ -1,87 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * heartbeat.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_HEARTBEAT_H -#define R2CLUSTER_HEARTBEAT_H - -#define R2HB_REGION_TIMEOUT_MS 2000 - -#define R2HB_MAX_REGION_NAME_LEN 32 - -/* number of changes to be seen as live */ -#define R2HB_LIVE_THRESHOLD 2 -/* number of equal samples to be seen as dead */ -extern unsigned int r2hb_dead_threshold; -#define R2HB_DEFAULT_DEAD_THRESHOLD 31 -/* Otherwise MAX_WRITE_TIMEOUT will be zero... */ -#define R2HB_MIN_DEAD_THRESHOLD 2 -#define R2HB_MAX_WRITE_TIMEOUT_MS \ - (R2HB_REGION_TIMEOUT_MS * (r2hb_dead_threshold - 1)) - -#define R2HB_CB_MAGIC 0x51d1e4ec - -/* callback stuff */ -enum r2hb_callback_type { - R2HB_NODE_DOWN_CB = 0, - R2HB_NODE_UP_CB, - R2HB_NUM_CB -}; - -struct r2nm_node; -typedef void (r2hb_cb_func)(struct r2nm_node *, int, void *); - -struct r2hb_callback_func { - u32 hc_magic; - struct list_head hc_item; - r2hb_cb_func *hc_func; - void *hc_data; - int hc_priority; - enum r2hb_callback_type hc_type; -}; - -struct config_group *r2hb_alloc_hb_set(void); -void r2hb_free_hb_set(struct config_group *group); - -void r2hb_setup_callback(struct r2hb_callback_func *hc, - enum r2hb_callback_type type, - r2hb_cb_func *func, - void *data, - int priority); -int r2hb_register_callback(const char *region_uuid, - struct r2hb_callback_func *hc); -void r2hb_unregister_callback(const char *region_uuid, - struct r2hb_callback_func *hc); -void r2hb_fill_node_map(unsigned long *map, - unsigned bytes); -void r2hb_exit(void); -int r2hb_init(void); -int r2hb_check_node_heartbeating_from_callback(u8 node_num); -void r2hb_stop_all_regions(void); -int r2hb_get_all_regions(char *region_uuids, u8 numregions); -int r2hb_global_heartbeat_active(void); -void r2hb_manual_set_node_heartbeating(int); - -#endif /* R2CLUSTER_HEARTBEAT_H */ diff --git a/drivers/staging/ramster/cluster/masklog.c b/drivers/staging/ramster/cluster/masklog.c deleted file mode 100755 index 1261d8579..000000000 --- a/drivers/staging/ramster/cluster/masklog.c +++ /dev/null @@ -1,155 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include -#include -#include -#include - -#include "masklog.h" - -struct mlog_bits r2_mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); -EXPORT_SYMBOL_GPL(r2_mlog_and_bits); -struct mlog_bits r2_mlog_not_bits = MLOG_BITS_RHS(0); -EXPORT_SYMBOL_GPL(r2_mlog_not_bits); - -static ssize_t mlog_mask_show(u64 mask, char *buf) -{ - char *state; - - if (__mlog_test_u64(mask, r2_mlog_and_bits)) - state = "allow"; - else if (__mlog_test_u64(mask, r2_mlog_not_bits)) - state = "deny"; - else - state = "off"; - - return snprintf(buf, PAGE_SIZE, "%s\n", state); -} - -static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) -{ - if (!strnicmp(buf, "allow", 5)) { - __mlog_set_u64(mask, r2_mlog_and_bits); - __mlog_clear_u64(mask, r2_mlog_not_bits); - } else if (!strnicmp(buf, "deny", 4)) { - __mlog_set_u64(mask, r2_mlog_not_bits); - __mlog_clear_u64(mask, r2_mlog_and_bits); - } else if (!strnicmp(buf, "off", 3)) { - __mlog_clear_u64(mask, r2_mlog_not_bits); - __mlog_clear_u64(mask, r2_mlog_and_bits); - } else - return -EINVAL; - - return count; -} - -struct mlog_attribute { - struct attribute attr; - u64 mask; -}; - -#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) - -#define define_mask(_name) { \ - .attr = { \ - .name = #_name, \ - .mode = S_IRUGO | S_IWUSR, \ - }, \ - .mask = ML_##_name, \ -} - -static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { - define_mask(TCP), - define_mask(MSG), - define_mask(SOCKET), - define_mask(HEARTBEAT), - define_mask(HB_BIO), - define_mask(DLMFS), - define_mask(DLM), - define_mask(DLM_DOMAIN), - define_mask(DLM_THREAD), - define_mask(DLM_MASTER), - define_mask(DLM_RECOVERY), - define_mask(DLM_GLUE), - define_mask(VOTE), - define_mask(CONN), - define_mask(QUORUM), - define_mask(BASTS), - define_mask(CLUSTER), - define_mask(ERROR), - define_mask(NOTICE), - define_mask(KTHREAD), -}; - -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; - -static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, - char *buf) -{ - struct mlog_attribute *mlog_attr = to_mlog_attr(attr); - - return mlog_mask_show(mlog_attr->mask, buf); -} - -static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, - const char *buf, size_t count) -{ - struct mlog_attribute *mlog_attr = to_mlog_attr(attr); - - return mlog_mask_store(mlog_attr->mask, buf, count); -} - -static const struct sysfs_ops mlog_attr_ops = { - .show = mlog_show, - .store = mlog_store, -}; - -static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, -}; - -static struct kset mlog_kset = { - .kobj = {.ktype = &mlog_ktype}, -}; - -int r2_mlog_sys_init(struct kset *r2cb_kset) -{ - int i = 0; - - while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; - i++; - } - mlog_attr_ptrs[i] = NULL; - - kobject_set_name(&mlog_kset.kobj, "logmask"); - mlog_kset.kobj.kset = r2cb_kset; - return kset_register(&mlog_kset); -} - -void r2_mlog_sys_shutdown(void) -{ - kset_unregister(&mlog_kset); -} diff --git a/drivers/staging/ramster/cluster/masklog.h b/drivers/staging/ramster/cluster/masklog.h deleted file mode 100755 index 918ae110b..000000000 --- a/drivers/staging/ramster/cluster/masklog.h +++ /dev/null @@ -1,220 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef R2CLUSTER_MASKLOG_H -#define R2CLUSTER_MASKLOG_H - -/* - * For now this is a trivial wrapper around printk() that gives the critical - * ability to enable sets of debugging output at run-time. In the future this - * will almost certainly be redirected to relayfs so that it can pay a - * substantially lower heisenberg tax. - * - * Callers associate the message with a bitmask and a global bitmask is - * maintained with help from /proc. If any of the bits match the message is - * output. - * - * We must have efficient bit tests on i386 and it seems gcc still emits crazy - * code for the 64bit compare. It emits very good code for the dual unsigned - * long tests, though, completely avoiding tests that can never pass if the - * caller gives a constant bitmask that fills one of the longs with all 0s. So - * the desire is to have almost all of the calls decided on by comparing just - * one of the longs. This leads to having infrequently given bits that are - * frequently matched in the high bits. - * - * _ERROR and _NOTICE are used for messages that always go to the console and - * have appropriate KERN_ prefixes. We wrap these in our function instead of - * just calling printk() so that this can eventually make its way through - * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. - * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant - * mask, as is almost always the case. - * - * All this bitmask nonsense is managed from the files under - * /sys/fs/r2cb/logmask/. Reading the files gives a straightforward - * indication of which bits are allowed (allow) or denied (off/deny). - * ENTRY deny - * EXIT deny - * TCP off - * MSG off - * SOCKET off - * ERROR allow - * NOTICE allow - * - * Writing changes the state of a given bit and requires a strictly formatted - * single write() call: - * - * write(fd, "allow", 5); - * - * Echoing allow/deny/off string into the logmask files can flip the bits - * on or off as expected; here is the bash script for example: - * - * log_mask="/sys/fs/r2cb/log_mask" - * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do - * echo allow >"$log_mask"/"$node" - * done - * - * The debugfs.ramster tool can also flip the bits with the -l option: - * - * debugfs.ramster -l TCP allow - */ - -/* for task_struct */ -#include - -/* bits that are frequently given and infrequently matched in the low word */ -/* NOTE: If you add a flag, you need to also update masklog.c! */ -#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ -#define ML_MSG 0x0000000000000002ULL /* net network messages */ -#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ -#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ -#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ -#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ -#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ -#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ -#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ -#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ -#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ -#define ML_DLM_GLUE 0x0000000000000800ULL /* ramster dlm glue layer */ -#define ML_VOTE 0x0000000000001000ULL /* ramster node messaging */ -#define ML_CONN 0x0000000000002000ULL /* net connection management */ -#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ -#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ -#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */ - -/* bits that are infrequently given and frequently matched in the high word */ -#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ -#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ - -#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) -#ifndef MLOG_MASK_PREFIX -#define MLOG_MASK_PREFIX 0 -#endif - -/* - * When logging is disabled, force the bit test to 0 for anything other - * than errors and notices, allowing gcc to remove the code completely. - * When enabled, allow all masks. - */ -#if defined(CONFIG_RAMSTER_DEBUG_MASKLOG) -#define ML_ALLOWED_BITS (~0) -#else -#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) -#endif - -#define MLOG_MAX_BITS 64 - -struct mlog_bits { - unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; -}; - -extern struct mlog_bits r2_mlog_and_bits, r2_mlog_not_bits; - -#if BITS_PER_LONG == 32 - -#define __mlog_test_u64(mask, bits) \ - ((u32)(mask & 0xffffffff) & bits.words[0] || \ - ((u64)(mask) >> 32) & bits.words[1]) -#define __mlog_set_u64(mask, bits) do { \ - bits.words[0] |= (u32)(mask & 0xffffffff); \ - bits.words[1] |= (u64)(mask) >> 32; \ -} while (0) -#define __mlog_clear_u64(mask, bits) do { \ - bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ - bits.words[1] &= ~((u64)(mask) >> 32); \ -} while (0) -#define MLOG_BITS_RHS(mask) { \ - { \ - [0] = (u32)(mask & 0xffffffff), \ - [1] = (u64)(mask) >> 32, \ - } \ -} - -#else /* 32bit long above, 64bit long below */ - -#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) -#define __mlog_set_u64(mask, bits) do { \ - bits.words[0] |= (mask); \ -} while (0) -#define __mlog_clear_u64(mask, bits) do { \ - bits.words[0] &= ~(mask); \ -} while (0) -#define MLOG_BITS_RHS(mask) { { (mask) } } - -#endif - -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) - -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, r2_mlog_and_bits) && \ - !__mlog_test_u64(__m, r2_mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else \ - __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ -} while (0) - -#define mlog_errno(st) do { \ - int _st = (st); \ - if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ - mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) - -#define mlog_bug_on_msg(cond, fmt, args...) do { \ - if (cond) { \ - mlog(ML_ERROR, "bug expression: " #cond "\n"); \ - mlog(ML_ERROR, fmt, ##args); \ - BUG(); \ - } \ -} while (0) - -#include -#include -int r2_mlog_sys_init(struct kset *r2cb_subsys); -void r2_mlog_sys_shutdown(void); - -#endif /* R2CLUSTER_MASKLOG_H */ diff --git a/drivers/staging/ramster/cluster/nodemanager.c b/drivers/staging/ramster/cluster/nodemanager.c deleted file mode 100755 index de0e5c8da..000000000 --- a/drivers/staging/ramster/cluster/nodemanager.c +++ /dev/null @@ -1,992 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include -#include - -#include "tcp.h" -#include "nodemanager.h" -#include "heartbeat.h" -#include "masklog.h" - -/* for now we operate under the assertion that there can be only one - * cluster active at a time. Changing this will require trickling - * cluster references throughout where nodes are looked up */ -struct r2nm_cluster *r2nm_single_cluster; - -char *r2nm_fence_method_desc[R2NM_FENCE_METHODS] = { - "reset", /* R2NM_FENCE_RESET */ - "panic", /* R2NM_FENCE_PANIC */ -}; - -struct r2nm_node *r2nm_get_node_by_num(u8 node_num) -{ - struct r2nm_node *node = NULL; - - if (node_num >= R2NM_MAX_NODES || r2nm_single_cluster == NULL) - goto out; - - read_lock(&r2nm_single_cluster->cl_nodes_lock); - node = r2nm_single_cluster->cl_nodes[node_num]; - if (node) - config_item_get(&node->nd_item); - read_unlock(&r2nm_single_cluster->cl_nodes_lock); -out: - return node; -} -EXPORT_SYMBOL_GPL(r2nm_get_node_by_num); - -int r2nm_configured_node_map(unsigned long *map, unsigned bytes) -{ - struct r2nm_cluster *cluster = r2nm_single_cluster; - - BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); - - if (cluster == NULL) - return -EINVAL; - - read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); - read_unlock(&cluster->cl_nodes_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(r2nm_configured_node_map); - -static struct r2nm_node *r2nm_node_ip_tree_lookup(struct r2nm_cluster *cluster, - __be32 ip_needle, - struct rb_node ***ret_p, - struct rb_node **ret_parent) -{ - struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; - struct rb_node *parent = NULL; - struct r2nm_node *node, *ret = NULL; - - while (*p) { - int cmp; - - parent = *p; - node = rb_entry(parent, struct r2nm_node, nd_ip_node); - - cmp = memcmp(&ip_needle, &node->nd_ipv4_address, - sizeof(ip_needle)); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { - ret = node; - break; - } - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - - return ret; -} - -struct r2nm_node *r2nm_get_node_by_ip(__be32 addr) -{ - struct r2nm_node *node = NULL; - struct r2nm_cluster *cluster = r2nm_single_cluster; - - if (cluster == NULL) - goto out; - - read_lock(&cluster->cl_nodes_lock); - node = r2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); - if (node) - config_item_get(&node->nd_item); - read_unlock(&cluster->cl_nodes_lock); - -out: - return node; -} -EXPORT_SYMBOL_GPL(r2nm_get_node_by_ip); - -void r2nm_node_put(struct r2nm_node *node) -{ - config_item_put(&node->nd_item); -} -EXPORT_SYMBOL_GPL(r2nm_node_put); - -void r2nm_node_get(struct r2nm_node *node) -{ - config_item_get(&node->nd_item); -} -EXPORT_SYMBOL_GPL(r2nm_node_get); - -u8 r2nm_this_node(void) -{ - u8 node_num = R2NM_MAX_NODES; - - if (r2nm_single_cluster && r2nm_single_cluster->cl_has_local) - node_num = r2nm_single_cluster->cl_local_node; - - return node_num; -} -EXPORT_SYMBOL_GPL(r2nm_this_node); - -/* node configfs bits */ - -static struct r2nm_cluster *to_r2nm_cluster(struct config_item *item) -{ - return item ? - container_of(to_config_group(item), struct r2nm_cluster, - cl_group) - : NULL; -} - -static struct r2nm_node *to_r2nm_node(struct config_item *item) -{ - return item ? container_of(item, struct r2nm_node, nd_item) : NULL; -} - -static void r2nm_node_release(struct config_item *item) -{ - struct r2nm_node *node = to_r2nm_node(item); - kfree(node); -} - -static ssize_t r2nm_node_num_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%d\n", node->nd_num); -} - -static struct r2nm_cluster *to_r2nm_cluster_from_node(struct r2nm_node *node) -{ - /* through the first node_set .parent - * mycluster/nodes/mynode == r2nm_cluster->r2nm_node_group->r2nm_node */ - return to_r2nm_cluster(node->nd_item.ci_parent->ci_parent); -} - -enum { - R2NM_NODE_ATTR_NUM = 0, - R2NM_NODE_ATTR_PORT, - R2NM_NODE_ATTR_ADDRESS, - R2NM_NODE_ATTR_LOCAL, -}; - -static ssize_t r2nm_node_num_write(struct r2nm_node *node, const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node); - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp >= R2NM_MAX_NODES) - return -ERANGE; - - /* once we're in the cl_nodes tree networking can look us up by - * node number and try to use our address and port attributes - * to connect to this node.. make sure that they've been set - * before writing the node attribute? */ - if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) - return -EINVAL; /* XXX */ - - write_lock(&cluster->cl_nodes_lock); - if (cluster->cl_nodes[tmp]) - p = NULL; - else { - cluster->cl_nodes[tmp] = node; - node->nd_num = tmp; - set_bit(tmp, cluster->cl_nodes_bitmap); - } - write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; - - return count; -} -static ssize_t r2nm_node_ipv4_port_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); -} - -static ssize_t r2nm_node_ipv4_port_write(struct r2nm_node *node, - const char *page, size_t count) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp == 0) - return -EINVAL; - if (tmp >= (u16)-1) - return -ERANGE; - - node->nd_ipv4_port = htons(tmp); - - return count; -} - -static ssize_t r2nm_node_ipv4_address_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); -} - -static ssize_t r2nm_node_ipv4_address_write(struct r2nm_node *node, - const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node); - int ret, i; - struct rb_node **p, *parent; - unsigned int octets[4]; - __be32 ipv4_addr = 0; - - ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], - &octets[1], &octets[0]); - if (ret != 4) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(octets); i++) { - if (octets[i] > 255) - return -ERANGE; - be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); - } - - ret = 0; - write_lock(&cluster->cl_nodes_lock); - if (r2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) - ret = -EEXIST; - else { - rb_link_node(&node->nd_ip_node, parent, p); - rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); - } - write_unlock(&cluster->cl_nodes_lock); - if (ret) - return ret; - - memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); - - return count; -} - -static ssize_t r2nm_node_local_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%d\n", node->nd_local); -} - -static ssize_t r2nm_node_local_write(struct r2nm_node *node, const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node); - unsigned long tmp; - char *p = (char *)page; - ssize_t ret; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - tmp = !!tmp; /* boolean of whether this node wants to be local */ - - /* setting local turns on networking rx for now so we require having - * set everything else first */ - if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) - return -EINVAL; /* XXX */ - - /* the only failure case is trying to set a new local node - * when a different one is already set */ - if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; - - /* bring up the rx thread if we're setting the new local node. */ - if (tmp && !cluster->cl_has_local) { - ret = r2net_start_listening(node); - if (ret) - return ret; - } - - if (!tmp && cluster->cl_has_local && - cluster->cl_local_node == node->nd_num) { - r2net_stop_listening(node); - cluster->cl_local_node = R2NM_INVALID_NODE_NUM; - } - - node->nd_local = tmp; - if (node->nd_local) { - cluster->cl_has_local = tmp; - cluster->cl_local_node = node->nd_num; - } - - return count; -} - -struct r2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2nm_node *, char *); - ssize_t (*store)(struct r2nm_node *, const char *, size_t); -}; - -static struct r2nm_node_attribute r2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_num_read, - .store = r2nm_node_num_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_ipv4_port_read, - .store = r2nm_node_ipv4_port_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_ipv4_address_read, - .store = r2nm_node_ipv4_address_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_local_read, - .store = r2nm_node_local_write, -}; - -static struct configfs_attribute *r2nm_node_attrs[] = { - [R2NM_NODE_ATTR_NUM] = &r2nm_node_attr_num.attr, - [R2NM_NODE_ATTR_PORT] = &r2nm_node_attr_ipv4_port.attr, - [R2NM_NODE_ATTR_ADDRESS] = &r2nm_node_attr_ipv4_address.attr, - [R2NM_NODE_ATTR_LOCAL] = &r2nm_node_attr_local.attr, - NULL, -}; - -static int r2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(r2nm_node_attrs); i++) { - if (attr == r2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t r2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_node_attribute *r2nm_node_attr = - container_of(attr, struct r2nm_node_attribute, attr); - ssize_t ret = 0; - - if (r2nm_node_attr->show) - ret = r2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t r2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_node_attribute *r2nm_node_attr = - container_of(attr, struct r2nm_node_attribute, attr); - ssize_t ret; - int attr_index = r2nm_attr_index(attr); - - if (r2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = r2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - -static struct configfs_item_operations r2nm_node_item_ops = { - .release = r2nm_node_release, - .show_attribute = r2nm_node_show, - .store_attribute = r2nm_node_store, -}; - -static struct config_item_type r2nm_node_type = { - .ct_item_ops = &r2nm_node_item_ops, - .ct_attrs = r2nm_node_attrs, - .ct_owner = THIS_MODULE, -}; - -/* node set */ - -struct r2nm_node_group { - struct config_group ns_group; - /* some stuff? */ -}; - -#if 0 -static struct r2nm_node_group *to_r2nm_node_group(struct config_group *group) -{ - return group ? - container_of(group, struct r2nm_node_group, ns_group) - : NULL; -} -#endif - -struct r2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2nm_cluster *, char *); - ssize_t (*store)(struct r2nm_cluster *, const char *, size_t); -}; - -static ssize_t r2nm_cluster_attr_write(const char *page, ssize_t count, - unsigned int *val) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp == 0) - return -EINVAL; - if (tmp >= (u32)-1) - return -ERANGE; - - *val = tmp; - - return count; -} - -static ssize_t r2nm_cluster_attr_idle_timeout_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); -} - -static ssize_t r2nm_cluster_attr_idle_timeout_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - ssize_t ret; - unsigned int val = 0; - - ret = r2nm_cluster_attr_write(page, count, &val); - - if (ret > 0) { - if (cluster->cl_idle_timeout_ms != val - && r2net_num_connected_peers()) { - mlog(ML_NOTICE, - "r2net: cannot change idle timeout after " - "the first peer has agreed to it." - " %d connected peers\n", - r2net_num_connected_peers()); - ret = -EINVAL; - } else if (val <= cluster->cl_keepalive_delay_ms) { - mlog(ML_NOTICE, "r2net: idle timeout must be larger " - "than keepalive delay\n"); - ret = -EINVAL; - } else { - cluster->cl_idle_timeout_ms = val; - } - } - - return ret; -} - -static ssize_t r2nm_cluster_attr_keepalive_delay_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); -} - -static ssize_t r2nm_cluster_attr_keepalive_delay_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - ssize_t ret; - unsigned int val = 0; - - ret = r2nm_cluster_attr_write(page, count, &val); - - if (ret > 0) { - if (cluster->cl_keepalive_delay_ms != val - && r2net_num_connected_peers()) { - mlog(ML_NOTICE, - "r2net: cannot change keepalive delay after" - " the first peer has agreed to it." - " %d connected peers\n", - r2net_num_connected_peers()); - ret = -EINVAL; - } else if (val >= cluster->cl_idle_timeout_ms) { - mlog(ML_NOTICE, "r2net: keepalive delay must be " - "smaller than idle timeout\n"); - ret = -EINVAL; - } else { - cluster->cl_keepalive_delay_ms = val; - } - } - - return ret; -} - -static ssize_t r2nm_cluster_attr_reconnect_delay_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); -} - -static ssize_t r2nm_cluster_attr_reconnect_delay_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - return r2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); -} - -static ssize_t r2nm_cluster_attr_fence_method_read( - struct r2nm_cluster *cluster, char *page) -{ - ssize_t ret = 0; - - if (cluster) - ret = sprintf(page, "%s\n", - r2nm_fence_method_desc[cluster->cl_fence_method]); - return ret; -} - -static ssize_t r2nm_cluster_attr_fence_method_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - unsigned int i; - - if (page[count - 1] != '\n') - goto bail; - - for (i = 0; i < R2NM_FENCE_METHODS; ++i) { - if (count != strlen(r2nm_fence_method_desc[i]) + 1) - continue; - if (strncasecmp(page, r2nm_fence_method_desc[i], count - 1)) - continue; - if (cluster->cl_fence_method != i) { - printk(KERN_INFO "ramster: Changing fence method to %s\n", - r2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; - } - return count; - } - -bail: - return -EINVAL; -} - -static struct r2nm_cluster_attribute r2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_idle_timeout_ms_read, - .store = r2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_keepalive_delay_ms_read, - .store = r2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_reconnect_delay_ms_read, - .store = r2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_fence_method_read, - .store = r2nm_cluster_attr_fence_method_write, -}; - -static struct configfs_attribute *r2nm_cluster_attrs[] = { - &r2nm_cluster_attr_idle_timeout_ms.attr, - &r2nm_cluster_attr_keepalive_delay_ms.attr, - &r2nm_cluster_attr_reconnect_delay_ms.attr, - &r2nm_cluster_attr_fence_method.attr, - NULL, -}; -static ssize_t r2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - struct r2nm_cluster_attribute *r2nm_cluster_attr = - container_of(attr, struct r2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (r2nm_cluster_attr->show) - ret = r2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t r2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - struct r2nm_cluster_attribute *r2nm_cluster_attr = - container_of(attr, struct r2nm_cluster_attribute, attr); - ssize_t ret; - - if (r2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = r2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} - -static struct config_item *r2nm_node_group_make_item(struct config_group *group, - const char *name) -{ - struct r2nm_node *node = NULL; - - if (strlen(name) > R2NM_MAX_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - node = kzalloc(sizeof(struct r2nm_node), GFP_KERNEL); - if (node == NULL) - return ERR_PTR(-ENOMEM); - - strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ - config_item_init_type_name(&node->nd_item, name, &r2nm_node_type); - spin_lock_init(&node->nd_lock); - - mlog(ML_CLUSTER, "r2nm: Registering node %s\n", name); - - return &node->nd_item; -} - -static void r2nm_node_group_drop_item(struct config_group *group, - struct config_item *item) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_cluster *cluster = - to_r2nm_cluster(group->cg_item.ci_parent); - - r2net_disconnect_node(node); - - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = R2NM_INVALID_NODE_NUM; - r2net_stop_listening(node); - } - - /* XXX call into net to stop this node from trading messages */ - - write_lock(&cluster->cl_nodes_lock); - - /* XXX sloppy */ - if (node->nd_ipv4_address) - rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); - - /* nd_num might be 0 if the node number hasn't been set.. */ - if (cluster->cl_nodes[node->nd_num] == node) { - cluster->cl_nodes[node->nd_num] = NULL; - clear_bit(node->nd_num, cluster->cl_nodes_bitmap); - } - write_unlock(&cluster->cl_nodes_lock); - - mlog(ML_CLUSTER, "r2nm: Unregistered node %s\n", - config_item_name(&node->nd_item)); - - config_item_put(item); -} - -static struct configfs_group_operations r2nm_node_group_group_ops = { - .make_item = r2nm_node_group_make_item, - .drop_item = r2nm_node_group_drop_item, -}; - -static struct config_item_type r2nm_node_group_type = { - .ct_group_ops = &r2nm_node_group_group_ops, - .ct_owner = THIS_MODULE, -}; - -/* cluster */ - -static void r2nm_cluster_release(struct config_item *item) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - - kfree(cluster->cl_group.default_groups); - kfree(cluster); -} - -static struct configfs_item_operations r2nm_cluster_item_ops = { - .release = r2nm_cluster_release, - .show_attribute = r2nm_cluster_show, - .store_attribute = r2nm_cluster_store, -}; - -static struct config_item_type r2nm_cluster_type = { - .ct_item_ops = &r2nm_cluster_item_ops, - .ct_attrs = r2nm_cluster_attrs, - .ct_owner = THIS_MODULE, -}; - -/* cluster set */ - -struct r2nm_cluster_group { - struct configfs_subsystem cs_subsys; - /* some stuff? */ -}; - -#if 0 -static struct r2nm_cluster_group * -to_r2nm_cluster_group(struct config_group *group) -{ - return group ? - container_of(to_configfs_subsystem(group), - struct r2nm_cluster_group, cs_subsys) - : NULL; -} -#endif - -static struct config_group * -r2nm_cluster_group_make_group(struct config_group *group, - const char *name) -{ - struct r2nm_cluster *cluster = NULL; - struct r2nm_node_group *ns = NULL; - struct config_group *r2hb_group = NULL, *ret = NULL; - void *defs = NULL; - - /* this runs under the parent dir's i_mutex; there can be only - * one caller in here at a time */ - if (r2nm_single_cluster) - return ERR_PTR(-ENOSPC); - - cluster = kzalloc(sizeof(struct r2nm_cluster), GFP_KERNEL); - ns = kzalloc(sizeof(struct r2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); - r2hb_group = r2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || r2hb_group == NULL || defs == NULL) - goto out; - - config_group_init_type_name(&cluster->cl_group, name, - &r2nm_cluster_type); - config_group_init_type_name(&ns->ns_group, "node", - &r2nm_node_group_type); - - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = r2hb_group; - cluster->cl_group.default_groups[2] = NULL; - rwlock_init(&cluster->cl_nodes_lock); - cluster->cl_node_ip_tree = RB_ROOT; - cluster->cl_reconnect_delay_ms = R2NET_RECONNECT_DELAY_MS_DEFAULT; - cluster->cl_idle_timeout_ms = R2NET_IDLE_TIMEOUT_MS_DEFAULT; - cluster->cl_keepalive_delay_ms = R2NET_KEEPALIVE_DELAY_MS_DEFAULT; - cluster->cl_fence_method = R2NM_FENCE_RESET; - - ret = &cluster->cl_group; - r2nm_single_cluster = cluster; - -out: - if (ret == NULL) { - kfree(cluster); - kfree(ns); - r2hb_free_hb_set(r2hb_group); - kfree(defs); - ret = ERR_PTR(-ENOMEM); - } - - return ret; -} - -static void r2nm_cluster_group_drop_item(struct config_group *group, - struct config_item *item) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - int i; - struct config_item *killme; - - BUG_ON(r2nm_single_cluster != cluster); - r2nm_single_cluster = NULL; - - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - - config_item_put(item); -} - -static struct configfs_group_operations r2nm_cluster_group_group_ops = { - .make_group = r2nm_cluster_group_make_group, - .drop_item = r2nm_cluster_group_drop_item, -}; - -static struct config_item_type r2nm_cluster_group_type = { - .ct_group_ops = &r2nm_cluster_group_group_ops, - .ct_owner = THIS_MODULE, -}; - -static struct r2nm_cluster_group r2nm_cluster_group = { - .cs_subsys = { - .su_group = { - .cg_item = { - .ci_namebuf = "cluster", - .ci_type = &r2nm_cluster_group_type, - }, - }, - }, -}; - -int r2nm_depend_item(struct config_item *item) -{ - return configfs_depend_item(&r2nm_cluster_group.cs_subsys, item); -} - -void r2nm_undepend_item(struct config_item *item) -{ - configfs_undepend_item(&r2nm_cluster_group.cs_subsys, item); -} - -int r2nm_depend_this_node(void) -{ - int ret = 0; - struct r2nm_node *local_node; - - local_node = r2nm_get_node_by_num(r2nm_this_node()); - if (!local_node) { - ret = -EINVAL; - goto out; - } - - ret = r2nm_depend_item(&local_node->nd_item); - r2nm_node_put(local_node); - -out: - return ret; -} - -void r2nm_undepend_this_node(void) -{ - struct r2nm_node *local_node; - - local_node = r2nm_get_node_by_num(r2nm_this_node()); - BUG_ON(!local_node); - - r2nm_undepend_item(&local_node->nd_item); - r2nm_node_put(local_node); -} - - -static void __exit exit_r2nm(void) -{ - /* XXX sync with hb callbacks and shut down hb? */ - r2net_unregister_hb_callbacks(); - configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys); - - r2net_exit(); - r2hb_exit(); -} - -static int __init init_r2nm(void) -{ - int ret = -1; - - ret = r2hb_init(); - if (ret) - goto out; - - ret = r2net_init(); - if (ret) - goto out_r2hb; - - ret = r2net_register_hb_callbacks(); - if (ret) - goto out_r2net; - - config_group_init(&r2nm_cluster_group.cs_subsys.su_group); - mutex_init(&r2nm_cluster_group.cs_subsys.su_mutex); - ret = configfs_register_subsystem(&r2nm_cluster_group.cs_subsys); - if (ret) { - printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); - goto out_callbacks; - } - - if (!ret) - goto out; - - configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys); -out_callbacks: - r2net_unregister_hb_callbacks(); -out_r2net: - r2net_exit(); -out_r2hb: - r2hb_exit(); -out: - return ret; -} - -MODULE_AUTHOR("Oracle"); -MODULE_LICENSE("GPL"); - -module_init(init_r2nm) -module_exit(exit_r2nm) diff --git a/drivers/staging/ramster/cluster/nodemanager.h b/drivers/staging/ramster/cluster/nodemanager.h deleted file mode 100755 index 41a04df58..000000000 --- a/drivers/staging/ramster/cluster/nodemanager.h +++ /dev/null @@ -1,88 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * nodemanager.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_NODEMANAGER_H -#define R2CLUSTER_NODEMANAGER_H - -#include "ramster_nodemanager.h" - -/* This totally doesn't belong here. */ -#include -#include - -enum r2nm_fence_method { - R2NM_FENCE_RESET = 0, - R2NM_FENCE_PANIC, - R2NM_FENCE_METHODS, /* Number of fence methods */ -}; - -struct r2nm_node { - spinlock_t nd_lock; - struct config_item nd_item; - char nd_name[R2NM_MAX_NAME_LEN+1]; /* replace? */ - __u8 nd_num; - /* only one address per node, as attributes, for now. */ - __be32 nd_ipv4_address; - __be16 nd_ipv4_port; - struct rb_node nd_ip_node; - /* there can be only one local node for now */ - int nd_local; - - unsigned long nd_set_attributes; -}; - -struct r2nm_cluster { - struct config_group cl_group; - unsigned cl_has_local:1; - u8 cl_local_node; - rwlock_t cl_nodes_lock; - struct r2nm_node *cl_nodes[R2NM_MAX_NODES]; - struct rb_root cl_node_ip_tree; - unsigned int cl_idle_timeout_ms; - unsigned int cl_keepalive_delay_ms; - unsigned int cl_reconnect_delay_ms; - enum r2nm_fence_method cl_fence_method; - - /* part of a hack for disk bitmap.. will go eventually. - zab */ - unsigned long cl_nodes_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)]; -}; - -extern struct r2nm_cluster *r2nm_single_cluster; - -u8 r2nm_this_node(void); - -int r2nm_configured_node_map(unsigned long *map, unsigned bytes); -struct r2nm_node *r2nm_get_node_by_num(u8 node_num); -struct r2nm_node *r2nm_get_node_by_ip(__be32 addr); -void r2nm_node_get(struct r2nm_node *node); -void r2nm_node_put(struct r2nm_node *node); - -int r2nm_depend_item(struct config_item *item); -void r2nm_undepend_item(struct config_item *item); -int r2nm_depend_this_node(void); -void r2nm_undepend_this_node(void); - -#endif /* R2CLUSTER_NODEMANAGER_H */ diff --git a/drivers/staging/ramster/cluster/ramster_nodemanager.h b/drivers/staging/ramster/cluster/ramster_nodemanager.h deleted file mode 100755 index 49f879d94..000000000 --- a/drivers/staging/ramster/cluster/ramster_nodemanager.h +++ /dev/null @@ -1,39 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ramster_nodemanager.h - * - * Header describing the interface between userspace and the kernel - * for the ramster_nodemanager module. - * - * Copyright (C) 2002, 2004, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef _RAMSTER_NODEMANAGER_H -#define _RAMSTER_NODEMANAGER_H - -#define R2NM_API_VERSION 5 - -#define R2NM_MAX_NODES 255 -#define R2NM_INVALID_NODE_NUM 255 - -/* host name, group name, cluster name all 64 bytes */ -#define R2NM_MAX_NAME_LEN 64 /* __NEW_UTS_LEN */ - -#endif /* _RAMSTER_NODEMANAGER_H */ diff --git a/drivers/staging/ramster/cluster/tcp.c b/drivers/staging/ramster/cluster/tcp.c deleted file mode 100755 index 3af1b2c51..000000000 --- a/drivers/staging/ramster/cluster/tcp.c +++ /dev/null @@ -1,2256 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - * ---- - * - * Callers for this were originally written against a very simple synchronus - * API. This implementation reflects those simple callers. Some day I'm sure - * we'll need to move to a more robust posting/callback mechanism. - * - * Transmit calls pass in kernel virtual addresses and block copying this into - * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an - * 'int' which gets filled with an errno off the wire in response to the - * message they send. - * - * Handlers for unsolicited messages are registered. Each socket has a page - * that incoming data is copied into. First the header, then the data. - * Handlers are called from only one thread with a reference to this per-socket - * page. This page is destroyed after the handler call, so it can't be - * referenced beyond the call. Handlers may block but are discouraged from - * doing so. - * - * Any framing errors (bad magic, large payload lengths) close a connection. - * - * Our sock_container holds the state we associate with a socket. It's current - * framing state is held there as well as the refcounting we do around when it - * is safe to tear down the socket. The socket is only finally torn down from - * the container when the container loses all of its references -- so as long - * as you hold a ref on the container you can trust that the socket is valid - * for use with kernel socket APIs. - * - * Connections are initiated between a pair of nodes when the node with the - * higher node number gets a heartbeat callback which indicates that the lower - * numbered node has started heartbeating. The lower numbered node is passive - * and only accepts the connection if the higher numbered node is heartbeating. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "heartbeat.h" -#include "tcp.h" -#include "nodemanager.h" -#define MLOG_MASK_PREFIX ML_TCP -#include "masklog.h" - -#include "tcp_internal.h" - -#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" - -/* - * In the following two log macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define msglog(hdr, fmt, args...) do { \ - typeof(hdr) __hdr = (hdr); \ - mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ - "key %08x num %u] " fmt, \ - be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ - be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ - be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ - be32_to_cpu(__hdr->msg_num) , ##args); \ -} while (0) - -#define sclog(sc, fmt, args...) do { \ - typeof(sc) __sc = (sc); \ - mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ - "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ - __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ - ##args); \ -} while (0) - -static DEFINE_RWLOCK(r2net_handler_lock); -static struct rb_root r2net_handler_tree = RB_ROOT; - -static struct r2net_node r2net_nodes[R2NM_MAX_NODES]; - -/* XXX someday we'll need better accounting */ -static struct socket *r2net_listen_sock; - -/* - * listen work is only queued by the listening socket callbacks on the - * r2net_wq. teardown detaches the callbacks before destroying the workqueue. - * quorum work is queued as sock containers are shutdown.. stop_listening - * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and - * destroying the work queue. - */ -static struct workqueue_struct *r2net_wq; -static struct work_struct r2net_listen_work; - -static struct r2hb_callback_func r2net_hb_up, r2net_hb_down; -#define R2NET_HB_PRI 0x1 - -static struct r2net_handshake *r2net_hand; -static struct r2net_msg *r2net_keep_req, *r2net_keep_resp; - -static int r2net_sys_err_translations[R2NET_ERR_MAX] = { - [R2NET_ERR_NONE] = 0, - [R2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, - [R2NET_ERR_OVERFLOW] = -EOVERFLOW, - [R2NET_ERR_DIED] = -EHOSTDOWN,}; - -/* can't quite avoid *all* internal declarations :/ */ -static void r2net_sc_connect_completed(struct work_struct *work); -static void r2net_rx_until_empty(struct work_struct *work); -static void r2net_shutdown_sc(struct work_struct *work); -static void r2net_listen_data_ready(struct sock *sk, int bytes); -static void r2net_sc_send_keep_req(struct work_struct *work); -static void r2net_idle_timer(unsigned long data); -static void r2net_sc_postpone_idle(struct r2net_sock_container *sc); -static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc); - -#ifdef CONFIG_DEBUG_FS -static void r2net_init_nst(struct r2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ - INIT_LIST_HEAD(&nst->st_net_debug_item); - nst->st_task = task; - nst->st_msg_type = msgtype; - nst->st_msg_key = msgkey; - nst->st_node = node; -} - -static inline void r2net_set_nst_sock_time(struct r2net_send_tracking *nst) -{ - nst->st_sock_time = ktime_get(); -} - -static inline void r2net_set_nst_send_time(struct r2net_send_tracking *nst) -{ - nst->st_send_time = ktime_get(); -} - -static inline void r2net_set_nst_status_time(struct r2net_send_tracking *nst) -{ - nst->st_status_time = ktime_get(); -} - -static inline void r2net_set_nst_sock_container(struct r2net_send_tracking *nst, - struct r2net_sock_container *sc) -{ - nst->st_sc = sc; -} - -static inline void r2net_set_nst_msg_id(struct r2net_send_tracking *nst, - u32 msg_id) -{ - nst->st_id = msg_id; -} - -static inline void r2net_set_sock_timer(struct r2net_sock_container *sc) -{ - sc->sc_tv_timer = ktime_get(); -} - -static inline void r2net_set_data_ready_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_data_ready = ktime_get(); -} - -static inline void r2net_set_advance_start_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_advance_start = ktime_get(); -} - -static inline void r2net_set_advance_stop_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_advance_stop = ktime_get(); -} - -static inline void r2net_set_func_start_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_func_start = ktime_get(); -} - -static inline void r2net_set_func_stop_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_func_stop = ktime_get(); -} - -#else /* CONFIG_DEBUG_FS */ -# define r2net_init_nst(a, b, c, d, e) -# define r2net_set_nst_sock_time(a) -# define r2net_set_nst_send_time(a) -# define r2net_set_nst_status_time(a) -# define r2net_set_nst_sock_container(a, b) -# define r2net_set_nst_msg_id(a, b) -# define r2net_set_sock_timer(a) -# define r2net_set_data_ready_time(a) -# define r2net_set_advance_start_time(a) -# define r2net_set_advance_stop_time(a) -# define r2net_set_func_start_time(a) -# define r2net_set_func_stop_time(a) -#endif /* CONFIG_DEBUG_FS */ - -#ifdef CONFIG_RAMSTER_FS_STATS -static ktime_t r2net_get_func_run_time(struct r2net_sock_container *sc) -{ - return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); -} - -static void r2net_update_send_stats(struct r2net_send_tracking *nst, - struct r2net_sock_container *sc) -{ - sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, - ktime_sub(ktime_get(), - nst->st_status_time)); - sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, - ktime_sub(nst->st_status_time, - nst->st_send_time)); - sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, - ktime_sub(nst->st_send_time, - nst->st_sock_time)); - sc->sc_send_count++; -} - -static void r2net_update_recv_stats(struct r2net_sock_container *sc) -{ - sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, - r2net_get_func_run_time(sc)); - sc->sc_recv_count++; -} - -#else - -# define r2net_update_send_stats(a, b) - -# define r2net_update_recv_stats(sc) - -#endif /* CONFIG_RAMSTER_FS_STATS */ - -static inline int r2net_reconnect_delay(void) -{ - return r2nm_single_cluster->cl_reconnect_delay_ms; -} - -static inline int r2net_keepalive_delay(void) -{ - return r2nm_single_cluster->cl_keepalive_delay_ms; -} - -static inline int r2net_idle_timeout(void) -{ - return r2nm_single_cluster->cl_idle_timeout_ms; -} - -static inline int r2net_sys_err_to_errno(enum r2net_system_error err) -{ - int trans; - BUG_ON(err >= R2NET_ERR_MAX); - trans = r2net_sys_err_translations[err]; - - /* Just in case we mess up the translation table above */ - BUG_ON(err != R2NET_ERR_NONE && trans == 0); - return trans; -} - -struct r2net_node *r2net_nn_from_num(u8 node_num) -{ - BUG_ON(node_num >= ARRAY_SIZE(r2net_nodes)); - return &r2net_nodes[node_num]; -} - -static u8 r2net_num_from_nn(struct r2net_node *nn) -{ - BUG_ON(nn == NULL); - return nn - r2net_nodes; -} - -/* ------------------------------------------------------------ */ - -static int r2net_prep_nsw(struct r2net_node *nn, struct r2net_status_wait *nsw) -{ - int ret = 0; - - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); - - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = R2NET_ERR_NONE; - nsw->ns_status = 0; - } - - return ret; -} - -static void r2net_complete_nsw_locked(struct r2net_node *nn, - struct r2net_status_wait *nsw, - enum r2net_system_error sys_status, - s32 status) -{ - assert_spin_locked(&nn->nn_lock); - - if (!list_empty(&nsw->ns_node_item)) { - list_del_init(&nsw->ns_node_item); - nsw->ns_sys_status = sys_status; - nsw->ns_status = status; - idr_remove(&nn->nn_status_idr, nsw->ns_id); - wake_up(&nsw->ns_wq); - } -} - -static void r2net_complete_nsw(struct r2net_node *nn, - struct r2net_status_wait *nsw, - u64 id, enum r2net_system_error sys_status, - s32 status) -{ - spin_lock(&nn->nn_lock); - if (nsw == NULL) { - if (id > INT_MAX) - goto out; - - nsw = idr_find(&nn->nn_status_idr, id); - if (nsw == NULL) - goto out; - } - - r2net_complete_nsw_locked(nn, nsw, sys_status, status); - -out: - spin_unlock(&nn->nn_lock); - return; -} - -static void r2net_complete_nodes_nsw(struct r2net_node *nn) -{ - struct r2net_status_wait *nsw, *tmp; - unsigned int num_kills = 0; - - assert_spin_locked(&nn->nn_lock); - - list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { - r2net_complete_nsw_locked(nn, nsw, R2NET_ERR_DIED, 0); - num_kills++; - } - - mlog(0, "completed %d messages for node %u\n", num_kills, - r2net_num_from_nn(nn)); -} - -static int r2net_nsw_completed(struct r2net_node *nn, - struct r2net_status_wait *nsw) -{ - int completed; - spin_lock(&nn->nn_lock); - completed = list_empty(&nsw->ns_node_item); - spin_unlock(&nn->nn_lock); - return completed; -} - -/* ------------------------------------------------------------ */ - -static void sc_kref_release(struct kref *kref) -{ - struct r2net_sock_container *sc = container_of(kref, - struct r2net_sock_container, sc_kref); - BUG_ON(timer_pending(&sc->sc_idle_timeout)); - - sclog(sc, "releasing\n"); - - if (sc->sc_sock) { - sock_release(sc->sc_sock); - sc->sc_sock = NULL; - } - - r2nm_undepend_item(&sc->sc_node->nd_item); - r2nm_node_put(sc->sc_node); - sc->sc_node = NULL; - - r2net_debug_del_sc(sc); - kfree(sc); -} - -static void sc_put(struct r2net_sock_container *sc) -{ - sclog(sc, "put\n"); - kref_put(&sc->sc_kref, sc_kref_release); -} -static void sc_get(struct r2net_sock_container *sc) -{ - sclog(sc, "get\n"); - kref_get(&sc->sc_kref); -} -static struct r2net_sock_container *sc_alloc(struct r2nm_node *node) -{ - struct r2net_sock_container *sc, *ret = NULL; - struct page *page = NULL; - int status = 0; - - page = alloc_page(GFP_NOFS); - sc = kzalloc(sizeof(*sc), GFP_NOFS); - if (sc == NULL || page == NULL) - goto out; - - kref_init(&sc->sc_kref); - r2nm_node_get(node); - sc->sc_node = node; - - /* pin the node item of the remote node */ - status = r2nm_depend_item(&node->nd_item); - if (status) { - mlog_errno(status); - r2nm_node_put(node); - goto out; - } - INIT_WORK(&sc->sc_connect_work, r2net_sc_connect_completed); - INIT_WORK(&sc->sc_rx_work, r2net_rx_until_empty); - INIT_WORK(&sc->sc_shutdown_work, r2net_shutdown_sc); - INIT_DELAYED_WORK(&sc->sc_keepalive_work, r2net_sc_send_keep_req); - - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = r2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; - - sclog(sc, "alloced\n"); - - ret = sc; - sc->sc_page = page; - r2net_debug_add_sc(sc); - sc = NULL; - page = NULL; - -out: - if (page) - __free_page(page); - kfree(sc); - - return ret; -} - -/* ------------------------------------------------------------ */ - -static void r2net_sc_queue_work(struct r2net_sock_container *sc, - struct work_struct *work) -{ - sc_get(sc); - if (!queue_work(r2net_wq, work)) - sc_put(sc); -} -static void r2net_sc_queue_delayed_work(struct r2net_sock_container *sc, - struct delayed_work *work, - int delay) -{ - sc_get(sc); - if (!queue_delayed_work(r2net_wq, work, delay)) - sc_put(sc); -} -static void r2net_sc_cancel_delayed_work(struct r2net_sock_container *sc, - struct delayed_work *work) -{ - if (cancel_delayed_work(work)) - sc_put(sc); -} - -static atomic_t r2net_connected_peers = ATOMIC_INIT(0); - -int r2net_num_connected_peers(void) -{ - return atomic_read(&r2net_connected_peers); -} - -static void r2net_set_nn_state(struct r2net_node *nn, - struct r2net_sock_container *sc, - unsigned valid, int err) -{ - int was_valid = nn->nn_sc_valid; - int was_err = nn->nn_persistent_error; - struct r2net_sock_container *old_sc = nn->nn_sc; - - assert_spin_locked(&nn->nn_lock); - - if (old_sc && !sc) - atomic_dec(&r2net_connected_peers); - else if (!old_sc && sc) - atomic_inc(&r2net_connected_peers); - - /* the node num comparison and single connect/accept path should stop - * an non-null sc from being overwritten with another */ - BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); - mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); - mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); - - if (was_valid && !valid && err == 0) - err = -ENOTCONN; - - mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", - r2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, - nn->nn_persistent_error, err); - - nn->nn_sc = sc; - nn->nn_sc_valid = valid ? 1 : 0; - nn->nn_persistent_error = err; - - /* mirrors r2net_tx_can_proceed() */ - if (nn->nn_persistent_error || nn->nn_sc_valid) - wake_up(&nn->nn_sc_wq); - - if (!was_err && nn->nn_persistent_error) { - queue_delayed_work(r2net_wq, &nn->nn_still_up, - msecs_to_jiffies(R2NET_QUORUM_DELAY_MS)); - } - - if (was_valid && !valid) { - printk(KERN_NOTICE "ramster: No longer connected to " - SC_NODEF_FMT "\n", - old_sc->sc_node->nd_name, old_sc->sc_node->nd_num, - &old_sc->sc_node->nd_ipv4_address, - ntohs(old_sc->sc_node->nd_ipv4_port)); - r2net_complete_nodes_nsw(nn); - } - - if (!was_valid && valid) { - cancel_delayed_work(&nn->nn_connect_expired); - printk(KERN_NOTICE "ramster: %s " SC_NODEF_FMT "\n", - r2nm_this_node() > sc->sc_node->nd_num ? - "Connected to" : "Accepted connection from", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port)); - } - - /* trigger the connecting worker func as long as we're not valid, - * it will back off if it shouldn't connect. This can be called - * from node config teardown and so needs to be careful about - * the work queue actually being up. */ - if (!valid && r2net_wq) { - unsigned long delay; - /* delay if we're within a RECONNECT_DELAY of the - * last attempt */ - delay = (nn->nn_last_connect_attempt + - msecs_to_jiffies(r2net_reconnect_delay())) - - jiffies; - if (delay > msecs_to_jiffies(r2net_reconnect_delay())) - delay = 0; - mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); - queue_delayed_work(r2net_wq, &nn->nn_connect_work, delay); - - /* - * Delay the expired work after idle timeout. - * - * We might have lots of failed connection attempts that run - * through here but we only cancel the connect_expired work when - * a connection attempt succeeds. So only the first enqueue of - * the connect_expired work will do anything. The rest will see - * that it's already queued and do nothing. - */ - delay += msecs_to_jiffies(r2net_idle_timeout()); - queue_delayed_work(r2net_wq, &nn->nn_connect_expired, delay); - } - - /* keep track of the nn's sc ref for the caller */ - if ((old_sc == NULL) && sc) - sc_get(sc); - if (old_sc && (old_sc != sc)) { - r2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); - sc_put(old_sc); - } -} - -/* see r2net_register_callbacks() */ -static void r2net_data_ready(struct sock *sk, int bytes) -{ - void (*ready)(struct sock *sk, int bytes); - - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct r2net_sock_container *sc = sk->sk_user_data; - sclog(sc, "data_ready hit\n"); - r2net_set_data_ready_time(sc); - r2net_sc_queue_work(sc, &sc->sc_rx_work); - ready = sc->sc_data_ready; - } else { - ready = sk->sk_data_ready; - } - read_unlock(&sk->sk_callback_lock); - - ready(sk, bytes); -} - -/* see r2net_register_callbacks() */ -static void r2net_state_change(struct sock *sk) -{ - void (*state_change)(struct sock *sk); - struct r2net_sock_container *sc; - - read_lock(&sk->sk_callback_lock); - sc = sk->sk_user_data; - if (sc == NULL) { - state_change = sk->sk_state_change; - goto out; - } - - sclog(sc, "state_change to %d\n", sk->sk_state); - - state_change = sc->sc_state_change; - - switch (sk->sk_state) { - - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - r2net_sc_queue_work(sc, &sc->sc_connect_work); - break; - default: - printk(KERN_INFO "ramster: Connection to " - SC_NODEF_FMT " shutdown, state %d\n", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), sk->sk_state); - r2net_sc_queue_work(sc, &sc->sc_shutdown_work); - break; - - } -out: - read_unlock(&sk->sk_callback_lock); - state_change(sk); -} - -/* - * we register callbacks so we can queue work on events before calling - * the original callbacks. our callbacks our careful to test user_data - * to discover when they've reaced with r2net_unregister_callbacks(). - */ -static void r2net_register_callbacks(struct sock *sk, - struct r2net_sock_container *sc) -{ - write_lock_bh(&sk->sk_callback_lock); - - /* accepted sockets inherit the old listen socket data ready */ - if (sk->sk_data_ready == r2net_listen_data_ready) { - sk->sk_data_ready = sk->sk_user_data; - sk->sk_user_data = NULL; - } - - BUG_ON(sk->sk_user_data != NULL); - sk->sk_user_data = sc; - sc_get(sc); - - sc->sc_data_ready = sk->sk_data_ready; - sc->sc_state_change = sk->sk_state_change; - sk->sk_data_ready = r2net_data_ready; - sk->sk_state_change = r2net_state_change; - - mutex_init(&sc->sc_send_lock); - - write_unlock_bh(&sk->sk_callback_lock); -} - -static int r2net_unregister_callbacks(struct sock *sk, - struct r2net_sock_container *sc) -{ - int ret = 0; - - write_lock_bh(&sk->sk_callback_lock); - if (sk->sk_user_data == sc) { - ret = 1; - sk->sk_user_data = NULL; - sk->sk_data_ready = sc->sc_data_ready; - sk->sk_state_change = sc->sc_state_change; - } - write_unlock_bh(&sk->sk_callback_lock); - - return ret; -} - -/* - * this is a little helper that is called by callers who have seen a problem - * with an sc and want to detach it from the nn if someone already hasn't beat - * them to it. if an error is given then the shutdown will be persistent - * and pending transmits will be canceled. - */ -static void r2net_ensure_shutdown(struct r2net_node *nn, - struct r2net_sock_container *sc, - int err) -{ - spin_lock(&nn->nn_lock); - if (nn->nn_sc == sc) - r2net_set_nn_state(nn, NULL, 0, err); - spin_unlock(&nn->nn_lock); -} - -/* - * This work queue function performs the blocking parts of socket shutdown. A - * few paths lead here. set_nn_state will trigger this callback if it sees an - * sc detached from the nn. state_change will also trigger this callback - * directly when it sees errors. In that case we need to call set_nn_state - * ourselves as state_change couldn't get the nn_lock and call set_nn_state - * itself. - */ -static void r2net_shutdown_sc(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_shutdown_work); - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - - sclog(sc, "shutting down\n"); - - /* drop the callbacks ref and call shutdown only once */ - if (r2net_unregister_callbacks(sc->sc_sock->sk, sc)) { - /* we shouldn't flush as we're in the thread, the - * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); - r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); - sc_put(sc); - kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); - } - - /* not fatal so failed connects before the other guy has our - * heartbeat can be retried */ - r2net_ensure_shutdown(nn, sc, 0); - sc_put(sc); -} - -/* ------------------------------------------------------------ */ - -static int r2net_handler_cmp(struct r2net_msg_handler *nmh, u32 msg_type, - u32 key) -{ - int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); - - if (ret == 0) - ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); - - return ret; -} - -static struct r2net_msg_handler * -r2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, - struct rb_node **ret_parent) -{ - struct rb_node **p = &r2net_handler_tree.rb_node; - struct rb_node *parent = NULL; - struct r2net_msg_handler *nmh, *ret = NULL; - int cmp; - - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct r2net_msg_handler, nh_node); - cmp = r2net_handler_cmp(nmh, msg_type, key); - - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { - ret = nmh; - break; - } - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - - return ret; -} - -static void r2net_handler_kref_release(struct kref *kref) -{ - struct r2net_msg_handler *nmh; - nmh = container_of(kref, struct r2net_msg_handler, nh_kref); - - kfree(nmh); -} - -static void r2net_handler_put(struct r2net_msg_handler *nmh) -{ - kref_put(&nmh->nh_kref, r2net_handler_kref_release); -} - -/* max_len is protection for the handler func. incoming messages won't - * be given to the handler if their payload is longer than the max. */ -int r2net_register_handler(u32 msg_type, u32 key, u32 max_len, - r2net_msg_handler_func *func, void *data, - r2net_post_msg_handler_func *post_func, - struct list_head *unreg_list) -{ - struct r2net_msg_handler *nmh = NULL; - struct rb_node **p, *parent; - int ret = 0; - - if (max_len > R2NET_MAX_PAYLOAD_BYTES) { - mlog(0, "max_len for message handler out of range: %u\n", - max_len); - ret = -EINVAL; - goto out; - } - - if (!msg_type) { - mlog(0, "no message type provided: %u, %p\n", msg_type, func); - ret = -EINVAL; - goto out; - - } - if (!func) { - mlog(0, "no message handler provided: %u, %p\n", - msg_type, func); - ret = -EINVAL; - goto out; - } - - nmh = kzalloc(sizeof(struct r2net_msg_handler), GFP_NOFS); - if (nmh == NULL) { - ret = -ENOMEM; - goto out; - } - - nmh->nh_func = func; - nmh->nh_func_data = data; - nmh->nh_post_func = post_func; - nmh->nh_msg_type = msg_type; - nmh->nh_max_len = max_len; - nmh->nh_key = key; - /* the tree and list get this ref.. they're both removed in - * unregister when this ref is dropped */ - kref_init(&nmh->nh_kref); - INIT_LIST_HEAD(&nmh->nh_unregister_item); - - write_lock(&r2net_handler_lock); - if (r2net_handler_tree_lookup(msg_type, key, &p, &parent)) - ret = -EEXIST; - else { - rb_link_node(&nmh->nh_node, parent, p); - rb_insert_color(&nmh->nh_node, &r2net_handler_tree); - list_add_tail(&nmh->nh_unregister_item, unreg_list); - - mlog(ML_TCP, "registered handler func %p type %u key %08x\n", - func, msg_type, key); - /* we've had some trouble with handlers seemingly vanishing. */ - mlog_bug_on_msg(r2net_handler_tree_lookup(msg_type, key, &p, - &parent) == NULL, - "couldn't find handler we *just* registered " - "for type %u key %08x\n", msg_type, key); - } - write_unlock(&r2net_handler_lock); - if (ret) - goto out; - -out: - if (ret) - kfree(nmh); - - return ret; -} -EXPORT_SYMBOL_GPL(r2net_register_handler); - -void r2net_unregister_handler_list(struct list_head *list) -{ - struct r2net_msg_handler *nmh, *n; - - write_lock(&r2net_handler_lock); - list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { - mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", - nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); - rb_erase(&nmh->nh_node, &r2net_handler_tree); - list_del_init(&nmh->nh_unregister_item); - kref_put(&nmh->nh_kref, r2net_handler_kref_release); - } - write_unlock(&r2net_handler_lock); -} -EXPORT_SYMBOL_GPL(r2net_unregister_handler_list); - -static struct r2net_msg_handler *r2net_handler_get(u32 msg_type, u32 key) -{ - struct r2net_msg_handler *nmh; - - read_lock(&r2net_handler_lock); - nmh = r2net_handler_tree_lookup(msg_type, key, NULL, NULL); - if (nmh) - kref_get(&nmh->nh_kref); - read_unlock(&r2net_handler_lock); - - return nmh; -} - -/* ------------------------------------------------------------ */ - -static int r2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) -{ - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; -} - -static int r2net_send_tcp_msg(struct socket *sock, struct kvec *vec, - size_t veclen, size_t total) -{ - int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; - - if (sock == NULL) { - ret = -EINVAL; - goto out; - } - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; -out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); - return ret; -} - -static void r2net_sendpage(struct r2net_sock_container *sc, - void *kmalloced_virt, - size_t size) -{ - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - ssize_t ret; - - while (1) { - mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, - size, MSG_DONTWAIT); - mutex_unlock(&sc->sc_send_lock); - if (ret == size) - break; - if (ret == (ssize_t)-EAGAIN) { - mlog(0, "sendpage of size %zu to " SC_NODEF_FMT - " returned EAGAIN\n", size, sc->sc_node->nd_name, - sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port)); - cond_resched(); - continue; - } - mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT - " failed with %zd\n", size, sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), ret); - r2net_ensure_shutdown(nn, sc, 0); - break; - } -} - -static void r2net_init_msg(struct r2net_msg *msg, u16 data_len, - u16 msg_type, u32 key) -{ - memset(msg, 0, sizeof(struct r2net_msg)); - msg->magic = cpu_to_be16(R2NET_MSG_MAGIC); - msg->data_len = cpu_to_be16(data_len); - msg->msg_type = cpu_to_be16(msg_type); - msg->sys_status = cpu_to_be32(R2NET_ERR_NONE); - msg->status = 0; - msg->key = cpu_to_be32(key); -} - -static int r2net_tx_can_proceed(struct r2net_node *nn, - struct r2net_sock_container **sc_ret, - int *error) -{ - int ret = 0; - - spin_lock(&nn->nn_lock); - if (nn->nn_persistent_error) { - ret = 1; - *sc_ret = NULL; - *error = nn->nn_persistent_error; - } else if (nn->nn_sc_valid) { - kref_get(&nn->nn_sc->sc_kref); - - ret = 1; - *sc_ret = nn->nn_sc; - *error = 0; - } - spin_unlock(&nn->nn_lock); - - return ret; -} - -/* Get a map of all nodes to which this node is currently connected to */ -void r2net_fill_node_map(unsigned long *map, unsigned bytes) -{ - struct r2net_sock_container *sc; - int node, ret; - - BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); - for (node = 0; node < R2NM_MAX_NODES; ++node) { - r2net_tx_can_proceed(r2net_nn_from_num(node), &sc, &ret); - if (!ret) { - set_bit(node, map); - sc_put(sc); - } - } -} -EXPORT_SYMBOL_GPL(r2net_fill_node_map); - -int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, - size_t caller_veclen, u8 target_node, int *status) -{ - int ret = 0; - struct r2net_msg *msg = NULL; - size_t veclen, caller_bytes = 0; - struct kvec *vec = NULL; - struct r2net_sock_container *sc = NULL; - struct r2net_node *nn = r2net_nn_from_num(target_node); - struct r2net_status_wait nsw = { - .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), - }; - struct r2net_send_tracking nst; - - /* this may be a general bug fix */ - init_waitqueue_head(&nsw.ns_wq); - - r2net_init_nst(&nst, msg_type, key, current, target_node); - - if (r2net_wq == NULL) { - mlog(0, "attempt to tx without r2netd running\n"); - ret = -ESRCH; - goto out; - } - - if (caller_veclen == 0) { - mlog(0, "bad kvec array length\n"); - ret = -EINVAL; - goto out; - } - - caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); - if (caller_bytes > R2NET_MAX_PAYLOAD_BYTES) { - mlog(0, "total payload len %zu too large\n", caller_bytes); - ret = -EINVAL; - goto out; - } - - if (target_node == r2nm_this_node()) { - ret = -ELOOP; - goto out; - } - - r2net_debug_add_nst(&nst); - - r2net_set_nst_sock_time(&nst); - - wait_event(nn->nn_sc_wq, r2net_tx_can_proceed(nn, &sc, &ret)); - if (ret) - goto out; - - r2net_set_nst_sock_container(&nst, sc); - - veclen = caller_veclen + 1; - vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); - if (vec == NULL) { - mlog(0, "failed to %zu element kvec!\n", veclen); - ret = -ENOMEM; - goto out; - } - - msg = kmalloc(sizeof(struct r2net_msg), GFP_ATOMIC); - if (!msg) { - mlog(0, "failed to allocate a r2net_msg!\n"); - ret = -ENOMEM; - goto out; - } - - r2net_init_msg(msg, caller_bytes, msg_type, key); - - vec[0].iov_len = sizeof(struct r2net_msg); - vec[0].iov_base = msg; - memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); - - ret = r2net_prep_nsw(nn, &nsw); - if (ret) - goto out; - - msg->msg_num = cpu_to_be32(nsw.ns_id); - r2net_set_nst_msg_id(&nst, nsw.ns_id); - - r2net_set_nst_send_time(&nst); - - /* finally, convert the message header to network byte-order - * and send */ - mutex_lock(&sc->sc_send_lock); - ret = r2net_send_tcp_msg(sc->sc_sock, vec, veclen, - sizeof(struct r2net_msg) + caller_bytes); - mutex_unlock(&sc->sc_send_lock); - msglog(msg, "sending returned %d\n", ret); - if (ret < 0) { - mlog(0, "error returned from r2net_send_tcp_msg=%d\n", ret); - goto out; - } - - /* wait on other node's handler */ - r2net_set_nst_status_time(&nst); - wait_event(nsw.ns_wq, r2net_nsw_completed(nn, &nsw)); - - r2net_update_send_stats(&nst, sc); - - /* Note that we avoid overwriting the callers status return - * variable if a system error was reported on the other - * side. Callers beware. */ - ret = r2net_sys_err_to_errno(nsw.ns_sys_status); - if (status && !ret) - *status = nsw.ns_status; - - mlog(0, "woken, returning system status %d, user status %d\n", - ret, nsw.ns_status); -out: - r2net_debug_del_nst(&nst); /* must be before dropping sc and node */ - if (sc) - sc_put(sc); - kfree(vec); - kfree(msg); - r2net_complete_nsw(nn, &nsw, 0, 0, 0); - return ret; -} -EXPORT_SYMBOL_GPL(r2net_send_message_vec); - -int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len, - u8 target_node, int *status) -{ - struct kvec vec = { - .iov_base = data, - .iov_len = len, - }; - return r2net_send_message_vec(msg_type, key, &vec, 1, - target_node, status); -} -EXPORT_SYMBOL_GPL(r2net_send_message); - -static int r2net_send_status_magic(struct socket *sock, struct r2net_msg *hdr, - enum r2net_system_error syserr, int err) -{ - struct kvec vec = { - .iov_base = hdr, - .iov_len = sizeof(struct r2net_msg), - }; - - BUG_ON(syserr >= R2NET_ERR_MAX); - - /* leave other fields intact from the incoming message, msg_num - * in particular */ - hdr->sys_status = cpu_to_be32(syserr); - hdr->status = cpu_to_be32(err); - /* twiddle the magic */ - hdr->magic = cpu_to_be16(R2NET_MSG_STATUS_MAGIC); - hdr->data_len = 0; - - msglog(hdr, "about to send status magic %d\n", err); - /* hdr has been in host byteorder this whole time */ - return r2net_send_tcp_msg(sock, &vec, 1, sizeof(struct r2net_msg)); -} - -/* - * "data magic" is a long version of "status magic" where the message - * payload actually contains data to be passed in reply to certain messages - */ -static int r2net_send_data_magic(struct r2net_sock_container *sc, - struct r2net_msg *hdr, - void *data, size_t data_len, - enum r2net_system_error syserr, int err) -{ - struct kvec vec[2]; - int ret; - - vec[0].iov_base = hdr; - vec[0].iov_len = sizeof(struct r2net_msg); - vec[1].iov_base = data; - vec[1].iov_len = data_len; - - BUG_ON(syserr >= R2NET_ERR_MAX); - - /* leave other fields intact from the incoming message, msg_num - * in particular */ - hdr->sys_status = cpu_to_be32(syserr); - hdr->status = cpu_to_be32(err); - hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC); /* twiddle magic */ - hdr->data_len = cpu_to_be16(data_len); - - msglog(hdr, "about to send data magic %d\n", err); - /* hdr has been in host byteorder this whole time */ - ret = r2net_send_tcp_msg(sc->sc_sock, vec, 2, - sizeof(struct r2net_msg) + data_len); - return ret; -} - -/* - * called by a message handler to convert an otherwise normal reply - * message into a "data magic" message - */ -void r2net_force_data_magic(struct r2net_msg *hdr, u16 msgtype, u32 msgkey) -{ - hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC); - hdr->msg_type = cpu_to_be16(msgtype); - hdr->key = cpu_to_be32(msgkey); -} - -/* this returns -errno if the header was unknown or too large, etc. - * after this is called the buffer us reused for the next message */ -static int r2net_process_message(struct r2net_sock_container *sc, - struct r2net_msg *hdr) -{ - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - int ret = 0, handler_status; - enum r2net_system_error syserr; - struct r2net_msg_handler *nmh = NULL; - void *ret_data = NULL; - int data_magic = 0; - - msglog(hdr, "processing message\n"); - - r2net_sc_postpone_idle(sc); - - switch (be16_to_cpu(hdr->magic)) { - - case R2NET_MSG_STATUS_MAGIC: - /* special type for returning message status */ - r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), - be32_to_cpu(hdr->status)); - goto out; - case R2NET_MSG_KEEP_REQ_MAGIC: - r2net_sendpage(sc, r2net_keep_resp, sizeof(*r2net_keep_resp)); - goto out; - case R2NET_MSG_KEEP_RESP_MAGIC: - goto out; - case R2NET_MSG_MAGIC: - break; - case R2NET_MSG_DATA_MAGIC: - /* - * unlike a normal status magic, a data magic DOES - * (MUST) have a handler, so the control flow is - * a little funky here as a result - */ - data_magic = 1; - break; - default: - msglog(hdr, "bad magic\n"); - ret = -EINVAL; - goto out; - break; - } - - /* find a handler for it */ - handler_status = 0; - nmh = r2net_handler_get(be16_to_cpu(hdr->msg_type), - be32_to_cpu(hdr->key)); - if (!nmh) { - mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", - be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); - syserr = R2NET_ERR_NO_HNDLR; - goto out_respond; - } - - syserr = R2NET_ERR_NONE; - - if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) - syserr = R2NET_ERR_OVERFLOW; - - if (syserr != R2NET_ERR_NONE) - goto out_respond; - - r2net_set_func_start_time(sc); - sc->sc_msg_key = be32_to_cpu(hdr->key); - sc->sc_msg_type = be16_to_cpu(hdr->msg_type); - handler_status = (nmh->nh_func)(hdr, sizeof(struct r2net_msg) + - be16_to_cpu(hdr->data_len), - nmh->nh_func_data, &ret_data); - if (data_magic) { - /* - * handler handled data sent in reply to request - * so complete the transaction - */ - r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), handler_status); - goto out; - } - /* - * handler changed magic to DATA_MAGIC to reply to request for data, - * implies ret_data points to data to return and handler_status - * is the number of bytes of data - */ - if (be16_to_cpu(hdr->magic) == R2NET_MSG_DATA_MAGIC) { - ret = r2net_send_data_magic(sc, hdr, - ret_data, handler_status, - syserr, 0); - hdr = NULL; - mlog(0, "sending data reply %d, syserr %d returned %d\n", - handler_status, syserr, ret); - r2net_set_func_stop_time(sc); - - r2net_update_recv_stats(sc); - goto out; - } - r2net_set_func_stop_time(sc); - - r2net_update_recv_stats(sc); - -out_respond: - /* this destroys the hdr, so don't use it after this */ - mutex_lock(&sc->sc_send_lock); - ret = r2net_send_status_magic(sc->sc_sock, hdr, syserr, - handler_status); - mutex_unlock(&sc->sc_send_lock); - hdr = NULL; - mlog(0, "sending handler status %d, syserr %d returned %d\n", - handler_status, syserr, ret); - - if (nmh) { - BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); - if (nmh->nh_post_func) - (nmh->nh_post_func)(handler_status, nmh->nh_func_data, - ret_data); - } - -out: - if (nmh) - r2net_handler_put(nmh); - return ret; -} - -static int r2net_check_handshake(struct r2net_sock_container *sc) -{ - struct r2net_handshake *hand = page_address(sc->sc_page); - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - - if (hand->protocol_version != cpu_to_be64(R2NET_PROTOCOL_VERSION)) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " Advertised net " - "protocol version %llu but %llu is required. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - (unsigned long long)be64_to_cpu(hand->protocol_version), - R2NET_PROTOCOL_VERSION); - - /* don't bother reconnecting if its the wrong version. */ - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - /* - * Ensure timeouts are consistent with other nodes, otherwise - * we can end up with one node thinking that the other must be down, - * but isn't. This can ultimately cause corruption. - */ - if (be32_to_cpu(hand->r2net_idle_timeout_ms) != - r2net_idle_timeout()) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a network " - "idle timeout of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2net_idle_timeout_ms), - r2net_idle_timeout()); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - if (be32_to_cpu(hand->r2net_keepalive_delay_ms) != - r2net_keepalive_delay()) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a keepalive " - "delay of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2net_keepalive_delay_ms), - r2net_keepalive_delay()); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - if (be32_to_cpu(hand->r2hb_heartbeat_timeout_ms) != - R2HB_MAX_WRITE_TIMEOUT_MS) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a heartbeat " - "timeout of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2hb_heartbeat_timeout_ms), - R2HB_MAX_WRITE_TIMEOUT_MS); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - sc->sc_handshake_ok = 1; - - spin_lock(&nn->nn_lock); - /* set valid and queue the idle timers only if it hasn't been - * shut down already */ - if (nn->nn_sc == sc) { - r2net_sc_reset_idle_timer(sc); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, sc, 1, 0); - } - spin_unlock(&nn->nn_lock); - - /* shift everything up as though it wasn't there */ - sc->sc_page_off -= sizeof(struct r2net_handshake); - if (sc->sc_page_off) - memmove(hand, hand + 1, sc->sc_page_off); - - return 0; -} - -/* this demuxes the queued rx bytes into header or payload bits and calls - * handlers as each full message is read off the socket. it returns -error, - * == 0 eof, or > 0 for progress made.*/ -static int r2net_advance_rx(struct r2net_sock_container *sc) -{ - struct r2net_msg *hdr; - int ret = 0; - void *data; - size_t datalen; - - sclog(sc, "receiving\n"); - r2net_set_advance_start_time(sc); - - if (unlikely(sc->sc_handshake_ok == 0)) { - if (sc->sc_page_off < sizeof(struct r2net_handshake)) { - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = sizeof(struct r2net_handshake) - - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) - sc->sc_page_off += ret; - } - - if (sc->sc_page_off == sizeof(struct r2net_handshake)) { - r2net_check_handshake(sc); - if (unlikely(sc->sc_handshake_ok == 0)) - ret = -EPROTO; - } - goto out; - } - - /* do we need more header? */ - if (sc->sc_page_off < sizeof(struct r2net_msg)) { - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = sizeof(struct r2net_msg) - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) { - sc->sc_page_off += ret; - /* only swab incoming here.. we can - * only get here once as we cross from - * being under to over */ - if (sc->sc_page_off == sizeof(struct r2net_msg)) { - hdr = page_address(sc->sc_page); - if (be16_to_cpu(hdr->data_len) > - R2NET_MAX_PAYLOAD_BYTES) - ret = -EOVERFLOW; - } - } - if (ret <= 0) - goto out; - } - - if (sc->sc_page_off < sizeof(struct r2net_msg)) { - /* oof, still don't have a header */ - goto out; - } - - /* this was swabbed above when we first read it */ - hdr = page_address(sc->sc_page); - - msglog(hdr, "at page_off %zu\n", sc->sc_page_off); - - /* do we need more payload? */ - if (sc->sc_page_off - sizeof(struct r2net_msg) < - be16_to_cpu(hdr->data_len)) { - /* need more payload */ - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = (sizeof(struct r2net_msg) + - be16_to_cpu(hdr->data_len)) - - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) - sc->sc_page_off += ret; - if (ret <= 0) - goto out; - } - - if (sc->sc_page_off - sizeof(struct r2net_msg) == - be16_to_cpu(hdr->data_len)) { - /* we can only get here once, the first time we read - * the payload.. so set ret to progress if the handler - * works out. after calling this the message is toast */ - ret = r2net_process_message(sc, hdr); - if (ret == 0) - ret = 1; - sc->sc_page_off = 0; - } - -out: - sclog(sc, "ret = %d\n", ret); - r2net_set_advance_stop_time(sc); - return ret; -} - -/* this work func is triggerd by data ready. it reads until it can read no - * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing - * our work the work struct will be marked and we'll be called again. */ -static void r2net_rx_until_empty(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, sc_rx_work); - int ret; - - do { - ret = r2net_advance_rx(sc); - } while (ret > 0); - - if (ret <= 0 && ret != -EAGAIN) { - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - sclog(sc, "saw error %d, closing\n", ret); - /* not permanent so read failed handshake can retry */ - r2net_ensure_shutdown(nn, sc, 0); - } - - sc_put(sc); -} - -static int r2net_set_nodelay(struct socket *sock) -{ - int ret, val = 1; - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; -} - -static void r2net_initialize_handshake(void) -{ - r2net_hand->r2hb_heartbeat_timeout_ms = cpu_to_be32( - R2HB_MAX_WRITE_TIMEOUT_MS); - r2net_hand->r2net_idle_timeout_ms = cpu_to_be32(r2net_idle_timeout()); - r2net_hand->r2net_keepalive_delay_ms = cpu_to_be32( - r2net_keepalive_delay()); - r2net_hand->r2net_reconnect_delay_ms = cpu_to_be32( - r2net_reconnect_delay()); -} - -/* ------------------------------------------------------------ */ - -/* called when a connect completes and after a sock is accepted. the - * rx path will see the response and mark the sc valid */ -static void r2net_sc_connect_completed(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_connect_work); - - mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", - (unsigned long long)R2NET_PROTOCOL_VERSION, - (unsigned long long)be64_to_cpu(r2net_hand->connector_id)); - - r2net_initialize_handshake(); - r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand)); - sc_put(sc); -} - -/* this is called as a work_struct func. */ -static void r2net_sc_send_keep_req(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_keepalive_work.work); - - r2net_sendpage(sc, r2net_keep_req, sizeof(*r2net_keep_req)); - sc_put(sc); -} - -/* socket shutdown does a del_timer_sync against this as it tears down. - * we can't start this timer until we've got to the point in sc buildup - * where shutdown is going to be involved */ -static void r2net_idle_timer(unsigned long data) -{ - struct r2net_sock_container *sc = (struct r2net_sock_container *)data; -#ifdef CONFIG_DEBUG_FS - unsigned long msecs = ktime_to_ms(ktime_get()) - - ktime_to_ms(sc->sc_tv_timer); -#else - unsigned long msecs = r2net_idle_timeout(); -#endif - - printk(KERN_NOTICE "ramster: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, ntohs(sc->sc_node->nd_ipv4_port), - msecs / 1000, msecs % 1000); - - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in r2net_start_connect. - */ - /* Avoid spurious shutdowns... not sure if this is still necessary */ - pr_err("ramster_idle_timer, skipping shutdown work\n"); -#if 0 - /* old code used to do these two lines */ - atomic_set(&nn->nn_timeout, 1); - r2net_sc_queue_work(sc, &sc->sc_shutdown_work); -#endif -} - -static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc) -{ - r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); - r2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, - msecs_to_jiffies(r2net_keepalive_delay())); - r2net_set_sock_timer(sc); - mod_timer(&sc->sc_idle_timeout, - jiffies + msecs_to_jiffies(r2net_idle_timeout())); -} - -static void r2net_sc_postpone_idle(struct r2net_sock_container *sc) -{ - /* Only push out an existing timer */ - if (timer_pending(&sc->sc_idle_timeout)) - r2net_sc_reset_idle_timer(sc); -} - -/* this work func is kicked whenever a path sets the nn state which doesn't - * have valid set. This includes seeing hb come up, losing a connection, - * having a connect attempt fail, etc. This centralizes the logic which decides - * if a connect attempt should be made or if we should give up and all future - * transmit attempts should fail */ -static void r2net_start_connect(struct work_struct *work) -{ - struct r2net_node *nn = - container_of(work, struct r2net_node, nn_connect_work.work); - struct r2net_sock_container *sc = NULL; - struct r2nm_node *node = NULL, *mynode = NULL; - struct socket *sock = NULL; - struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; - int ret = 0, stop; - unsigned int timeout; - - /* if we're greater we initiate tx, otherwise we accept */ - if (r2nm_this_node() <= r2net_num_from_nn(nn)) - goto out; - - /* watch for racing with tearing a node down */ - node = r2nm_get_node_by_num(r2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; - goto out; - } - - mynode = r2nm_get_node_by_num(r2nm_this_node()); - if (mynode == NULL) { - ret = 0; - goto out; - } - - spin_lock(&nn->nn_lock); - /* - * see if we already have one pending or have given up. - * For nn_timeout, it is set when we close the connection - * because of the idle time out. So it means that we have - * at least connected to that node successfully once, - * now try to connect to it again. - */ - timeout = atomic_read(&nn->nn_timeout); - stop = (nn->nn_sc || - (nn->nn_persistent_error && - (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); - spin_unlock(&nn->nn_lock); - if (stop) - goto out; - - nn->nn_last_connect_attempt = jiffies; - - sc = sc_alloc(node); - if (sc == NULL) { - mlog(0, "couldn't allocate sc\n"); - ret = -ENOMEM; - goto out; - } - - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) { - mlog(0, "can't create socket: %d\n", ret); - goto out; - } - sc->sc_sock = sock; /* freed by sc_kref_release */ - - sock->sk->sk_allocation = GFP_ATOMIC; - - myaddr.sin_family = AF_INET; - myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; - myaddr.sin_port = htons(0); /* any port */ - - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, - sizeof(myaddr)); - if (ret) { - mlog(ML_ERROR, "bind failed with %d at address %pI4\n", - ret, &mynode->nd_ipv4_address); - goto out; - } - - ret = r2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - r2net_register_callbacks(sc->sc_sock->sk, sc); - - spin_lock(&nn->nn_lock); - /* handshake completion will set nn->nn_sc_valid */ - r2net_set_nn_state(nn, sc, 0, 0); - spin_unlock(&nn->nn_lock); - - remoteaddr.sin_family = AF_INET; - remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; - remoteaddr.sin_port = node->nd_ipv4_port; - - ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, - sizeof(remoteaddr), - O_NONBLOCK); - if (ret == -EINPROGRESS) - ret = 0; - -out: - if (ret) { - printk(KERN_NOTICE "ramster: Connect attempt to " SC_NODEF_FMT - " failed with errno %d\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), ret); - /* 0 err so that another will be queued and attempted - * from set_nn_state */ - if (sc) - r2net_ensure_shutdown(nn, sc, 0); - } - if (sc) - sc_put(sc); - if (node) - r2nm_node_put(node); - if (mynode) - r2nm_node_put(mynode); - - return; -} - -static void r2net_connect_expired(struct work_struct *work) -{ - struct r2net_node *nn = - container_of(work, struct r2net_node, nn_connect_expired.work); - - spin_lock(&nn->nn_lock); - if (!nn->nn_sc_valid) { - printk(KERN_NOTICE "ramster: No connection established with " - "node %u after %u.%u seconds, giving up.\n", - r2net_num_from_nn(nn), - r2net_idle_timeout() / 1000, - r2net_idle_timeout() % 1000); - - r2net_set_nn_state(nn, NULL, 0, -ENOTCONN); - } - spin_unlock(&nn->nn_lock); -} - -static void r2net_still_up(struct work_struct *work) -{ -} - -/* ------------------------------------------------------------ */ - -void r2net_disconnect_node(struct r2nm_node *node) -{ - struct r2net_node *nn = r2net_nn_from_num(node->nd_num); - - /* don't reconnect until it's heartbeating again */ - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, NULL, 0, -ENOTCONN); - spin_unlock(&nn->nn_lock); - - if (r2net_wq) { - cancel_delayed_work(&nn->nn_connect_expired); - cancel_delayed_work(&nn->nn_connect_work); - cancel_delayed_work(&nn->nn_still_up); - flush_workqueue(r2net_wq); - } -} - -static void r2net_hb_node_down_cb(struct r2nm_node *node, int node_num, - void *data) -{ - if (!node) - return; - - if (node_num != r2nm_this_node()) - r2net_disconnect_node(node); - - BUG_ON(atomic_read(&r2net_connected_peers) < 0); -} - -static void r2net_hb_node_up_cb(struct r2nm_node *node, int node_num, - void *data) -{ - struct r2net_node *nn = r2net_nn_from_num(node_num); - - BUG_ON(!node); - - /* ensure an immediate connect attempt */ - nn->nn_last_connect_attempt = jiffies - - (msecs_to_jiffies(r2net_reconnect_delay()) + 1); - - if (node_num != r2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing - * can succeed for this node before we got here.. so - * only use set_nn_state to clear the persistent error - * if that hasn't already happened */ - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - if (nn->nn_persistent_error) - r2net_set_nn_state(nn, NULL, 0, 0); - spin_unlock(&nn->nn_lock); - } -} - -void r2net_unregister_hb_callbacks(void) -{ - r2hb_unregister_callback(NULL, &r2net_hb_up); - r2hb_unregister_callback(NULL, &r2net_hb_down); -} - -int r2net_register_hb_callbacks(void) -{ - int ret; - - r2hb_setup_callback(&r2net_hb_down, R2HB_NODE_DOWN_CB, - r2net_hb_node_down_cb, NULL, R2NET_HB_PRI); - r2hb_setup_callback(&r2net_hb_up, R2HB_NODE_UP_CB, - r2net_hb_node_up_cb, NULL, R2NET_HB_PRI); - - ret = r2hb_register_callback(NULL, &r2net_hb_up); - if (ret == 0) - ret = r2hb_register_callback(NULL, &r2net_hb_down); - - if (ret) - r2net_unregister_hb_callbacks(); - - return ret; -} - -/* ------------------------------------------------------------ */ - -static int r2net_accept_one(struct socket *sock) -{ - int ret, slen; - struct sockaddr_in sin; - struct socket *new_sock = NULL; - struct r2nm_node *node = NULL; - struct r2nm_node *local_node = NULL; - struct r2net_sock_container *sc = NULL; - struct r2net_node *nn; - - BUG_ON(sock == NULL); - ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); - if (ret) - goto out; - - new_sock->type = sock->type; - new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); - if (ret < 0) - goto out; - - new_sock->sk->sk_allocation = GFP_ATOMIC; - - ret = r2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); - if (ret < 0) - goto out; - - node = r2nm_get_node_by_ip(sin.sin_addr.s_addr); - if (node == NULL) { - printk(KERN_NOTICE "ramster: Attempt to connect from unknown " - "node at %pI4:%d\n", &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - ret = -EINVAL; - goto out; - } - - if (r2nm_this_node() >= node->nd_num) { - local_node = r2nm_get_node_by_num(r2nm_this_node()); - printk(KERN_NOTICE "ramster: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - ret = -EINVAL; - goto out; - } - - /* this happens all the time when the other node sees our heartbeat - * and tries to connect before we see their heartbeat */ - if (!r2hb_check_node_heartbeating_from_callback(node->nd_num)) { - mlog(ML_CONN, "attempt to connect from node '%s' at " - "%pI4:%d but it isn't heartbeating\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - ret = -EINVAL; - goto out; - } - - nn = r2net_nn_from_num(node->nd_num); - - spin_lock(&nn->nn_lock); - if (nn->nn_sc) - ret = -EBUSY; - else - ret = 0; - spin_unlock(&nn->nn_lock); - if (ret) { - printk(KERN_NOTICE "ramster: Attempt to connect from node '%s' " - "at %pI4:%d but it already has an open connection\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - goto out; - } - - sc = sc_alloc(node); - if (sc == NULL) { - ret = -ENOMEM; - goto out; - } - - sc->sc_sock = new_sock; - new_sock = NULL; - - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, sc, 0, 0); - spin_unlock(&nn->nn_lock); - - r2net_register_callbacks(sc->sc_sock->sk, sc); - r2net_sc_queue_work(sc, &sc->sc_rx_work); - - r2net_initialize_handshake(); - r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand)); - -out: - if (new_sock) - sock_release(new_sock); - if (node) - r2nm_node_put(node); - if (local_node) - r2nm_node_put(local_node); - if (sc) - sc_put(sc); - return ret; -} - -static void r2net_accept_many(struct work_struct *work) -{ - struct socket *sock = r2net_listen_sock; - while (r2net_accept_one(sock) == 0) - cond_resched(); -} - -static void r2net_listen_data_ready(struct sock *sk, int bytes) -{ - void (*ready)(struct sock *sk, int bytes); - - read_lock(&sk->sk_callback_lock); - ready = sk->sk_user_data; - if (ready == NULL) { /* check for teardown race */ - ready = sk->sk_data_ready; - goto out; - } - - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ - if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); - queue_work(r2net_wq, &r2net_listen_work); - } - -out: - read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); -} - -static int r2net_open_listening_sock(__be32 addr, __be16 port) -{ - struct socket *sock = NULL; - int ret; - struct sockaddr_in sin = { - .sin_family = PF_INET, - .sin_addr = { .s_addr = addr }, - .sin_port = port, - }; - - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) { - printk(KERN_ERR "ramster: Error %d while creating socket\n", - ret); - goto out; - } - - sock->sk->sk_allocation = GFP_ATOMIC; - - write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_user_data = sock->sk->sk_data_ready; - sock->sk->sk_data_ready = r2net_listen_data_ready; - write_unlock_bh(&sock->sk->sk_callback_lock); - - r2net_listen_sock = sock; - INIT_WORK(&r2net_listen_work, r2net_accept_many); - - sock->sk->sk_reuse = 1; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) { - printk(KERN_ERR "ramster: Error %d while binding socket at " - "%pI4:%u\n", ret, &addr, ntohs(port)); - goto out; - } - - ret = sock->ops->listen(sock, 64); - if (ret < 0) - printk(KERN_ERR "ramster: Error %d while listening on %pI4:%u\n", - ret, &addr, ntohs(port)); - -out: - if (ret) { - r2net_listen_sock = NULL; - if (sock) - sock_release(sock); - } - return ret; -} - -/* - * called from node manager when we should bring up our network listening - * socket. node manager handles all the serialization to only call this - * once and to match it with r2net_stop_listening(). note, - * r2nm_this_node() doesn't work yet as we're being called while it - * is being set up. - */ -int r2net_start_listening(struct r2nm_node *node) -{ - int ret = 0; - - BUG_ON(r2net_wq != NULL); - BUG_ON(r2net_listen_sock != NULL); - - mlog(ML_KTHREAD, "starting r2net thread...\n"); - r2net_wq = create_singlethread_workqueue("r2net"); - if (r2net_wq == NULL) { - mlog(ML_ERROR, "unable to launch r2net thread\n"); - return -ENOMEM; /* ? */ - } - - ret = r2net_open_listening_sock(node->nd_ipv4_address, - node->nd_ipv4_port); - if (ret) { - destroy_workqueue(r2net_wq); - r2net_wq = NULL; - } - - return ret; -} - -/* again, r2nm_this_node() doesn't work here as we're involved in - * tearing it down */ -void r2net_stop_listening(struct r2nm_node *node) -{ - struct socket *sock = r2net_listen_sock; - size_t i; - - BUG_ON(r2net_wq == NULL); - BUG_ON(r2net_listen_sock == NULL); - - /* stop the listening socket from generating work */ - write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_data_ready = sock->sk->sk_user_data; - sock->sk->sk_user_data = NULL; - write_unlock_bh(&sock->sk->sk_callback_lock); - - for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) { - struct r2nm_node *node = r2nm_get_node_by_num(i); - if (node) { - r2net_disconnect_node(node); - r2nm_node_put(node); - } - } - - /* finish all work and tear down the work queue */ - mlog(ML_KTHREAD, "waiting for r2net thread to exit....\n"); - destroy_workqueue(r2net_wq); - r2net_wq = NULL; - - sock_release(r2net_listen_sock); - r2net_listen_sock = NULL; -} - -void r2net_hb_node_up_manual(int node_num) -{ - struct r2nm_node dummy; - if (r2nm_single_cluster == NULL) - pr_err("ramster: cluster not alive, node_up_manual ignored\n"); - else { - r2hb_manual_set_node_heartbeating(node_num); - r2net_hb_node_up_cb(&dummy, node_num, NULL); - } -} - -/* ------------------------------------------------------------ */ - -int r2net_init(void) -{ - unsigned long i; - - if (r2net_debugfs_init()) - return -ENOMEM; - - r2net_hand = kzalloc(sizeof(struct r2net_handshake), GFP_KERNEL); - r2net_keep_req = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL); - r2net_keep_resp = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL); - if (!r2net_hand || !r2net_keep_req || !r2net_keep_resp) { - kfree(r2net_hand); - kfree(r2net_keep_req); - kfree(r2net_keep_resp); - return -ENOMEM; - } - - r2net_hand->protocol_version = cpu_to_be64(R2NET_PROTOCOL_VERSION); - r2net_hand->connector_id = cpu_to_be64(1); - - r2net_keep_req->magic = cpu_to_be16(R2NET_MSG_KEEP_REQ_MAGIC); - r2net_keep_resp->magic = cpu_to_be16(R2NET_MSG_KEEP_RESP_MAGIC); - - for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) { - struct r2net_node *nn = r2net_nn_from_num(i); - - atomic_set(&nn->nn_timeout, 0); - spin_lock_init(&nn->nn_lock); - INIT_DELAYED_WORK(&nn->nn_connect_work, r2net_start_connect); - INIT_DELAYED_WORK(&nn->nn_connect_expired, - r2net_connect_expired); - INIT_DELAYED_WORK(&nn->nn_still_up, r2net_still_up); - /* until we see hb from a node we'll return einval */ - nn->nn_persistent_error = -ENOTCONN; - init_waitqueue_head(&nn->nn_sc_wq); - idr_init(&nn->nn_status_idr); - INIT_LIST_HEAD(&nn->nn_status_list); - } - - return 0; -} - -void r2net_exit(void) -{ - kfree(r2net_hand); - kfree(r2net_keep_req); - kfree(r2net_keep_resp); - r2net_debugfs_exit(); -} diff --git a/drivers/staging/ramster/cluster/tcp.h b/drivers/staging/ramster/cluster/tcp.h deleted file mode 100755 index 9d0583345..000000000 --- a/drivers/staging/ramster/cluster/tcp.h +++ /dev/null @@ -1,159 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * tcp.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_TCP_H -#define R2CLUSTER_TCP_H - -#include -#ifdef __KERNEL__ -#include -#include -#else -#include -#endif -#include -#include - -struct r2net_msg { - __be16 magic; - __be16 data_len; - __be16 msg_type; - __be16 pad1; - __be32 sys_status; - __be32 status; - __be32 key; - __be32 msg_num; - __u8 buf[0]; -}; - -typedef int (r2net_msg_handler_func)(struct r2net_msg *msg, u32 len, void *data, - void **ret_data); -typedef void (r2net_post_msg_handler_func)(int status, void *data, - void *ret_data); - -#define R2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct r2net_msg)) - -/* same as hb delay, we're waiting for another node to recognize our hb */ -#define R2NET_RECONNECT_DELAY_MS_DEFAULT 2000 - -#define R2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 -#define R2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 - - -/* TODO: figure this out.... */ -static inline int r2net_link_down(int err, struct socket *sock) -{ - if (sock) { - if (sock->sk->sk_state != TCP_ESTABLISHED && - sock->sk->sk_state != TCP_CLOSE_WAIT) - return 1; - } - - if (err >= 0) - return 0; - switch (err) { - - /* ????????????????????????? */ - case -ERESTARTSYS: - case -EBADF: - /* When the server has died, an ICMP port unreachable - * message prompts ECONNREFUSED. */ - case -ECONNREFUSED: - case -ENOTCONN: - case -ECONNRESET: - case -EPIPE: - return 1; - - } - return 0; -} - -enum { - R2NET_DRIVER_UNINITED, - R2NET_DRIVER_READY, -}; - -int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len, - u8 target_node, int *status); -int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, - size_t veclen, u8 target_node, int *status); - -int r2net_register_handler(u32 msg_type, u32 key, u32 max_len, - r2net_msg_handler_func *func, void *data, - r2net_post_msg_handler_func *post_func, - struct list_head *unreg_list); -void r2net_unregister_handler_list(struct list_head *list); - -void r2net_fill_node_map(unsigned long *map, unsigned bytes); - -void r2net_force_data_magic(struct r2net_msg *, u16, u32); -void r2net_hb_node_up_manual(int); -struct r2net_node *r2net_nn_from_num(u8); - -struct r2nm_node; -int r2net_register_hb_callbacks(void); -void r2net_unregister_hb_callbacks(void); -int r2net_start_listening(struct r2nm_node *node); -void r2net_stop_listening(struct r2nm_node *node); -void r2net_disconnect_node(struct r2nm_node *node); -int r2net_num_connected_peers(void); - -int r2net_init(void); -void r2net_exit(void); - -struct r2net_send_tracking; -struct r2net_sock_container; - -#if 0 -int r2net_debugfs_init(void); -void r2net_debugfs_exit(void); -void r2net_debug_add_nst(struct r2net_send_tracking *nst); -void r2net_debug_del_nst(struct r2net_send_tracking *nst); -void r2net_debug_add_sc(struct r2net_sock_container *sc); -void r2net_debug_del_sc(struct r2net_sock_container *sc); -#else -static inline int r2net_debugfs_init(void) -{ - return 0; -} -static inline void r2net_debugfs_exit(void) -{ -} -static inline void r2net_debug_add_nst(struct r2net_send_tracking *nst) -{ -} -static inline void r2net_debug_del_nst(struct r2net_send_tracking *nst) -{ -} -static inline void r2net_debug_add_sc(struct r2net_sock_container *sc) -{ -} -static inline void r2net_debug_del_sc(struct r2net_sock_container *sc) -{ -} -#endif /* CONFIG_DEBUG_FS */ - -#endif /* R2CLUSTER_TCP_H */ diff --git a/drivers/staging/ramster/cluster/tcp_internal.h b/drivers/staging/ramster/cluster/tcp_internal.h deleted file mode 100755 index 4d8cc9f96..000000000 --- a/drivers/staging/ramster/cluster/tcp_internal.h +++ /dev/null @@ -1,248 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef R2CLUSTER_TCP_INTERNAL_H -#define R2CLUSTER_TCP_INTERNAL_H - -#define R2NET_MSG_MAGIC ((u16)0xfa55) -#define R2NET_MSG_STATUS_MAGIC ((u16)0xfa56) -#define R2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) -#define R2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) -/* - * "data magic" is a long version of "status magic" where the message - * payload actually contains data to be passed in reply to certain messages - */ -#define R2NET_MSG_DATA_MAGIC ((u16)0xfa59) - -/* we're delaying our quorum decision so that heartbeat will have timed - * out truly dead nodes by the time we come around to making decisions - * on their number */ -#define R2NET_QUORUM_DELAY_MS \ - ((r2hb_dead_threshold + 2) * R2HB_REGION_TIMEOUT_MS) - -/* - * This version number represents quite a lot, unfortunately. It not - * only represents the raw network message protocol on the wire but also - * locking semantics of the file system using the protocol. It should - * be somewhere else, I'm sure, but right now it isn't. - * - * With version 11, we separate out the filesystem locking portion. The - * filesystem now has a major.minor version it negotiates. Version 11 - * introduces this negotiation to the r2dlm protocol, and as such the - * version here in tcp_internal.h should not need to be bumped for - * filesystem locking changes. - * - * New in version 11 - * - Negotiation of filesystem locking in the dlm join. - * - * New in version 10: - * - Meta/data locks combined - * - * New in version 9: - * - All votes removed - * - * New in version 8: - * - Replace delete inode votes with a cluster lock - * - * New in version 7: - * - DLM join domain includes the live nodemap - * - * New in version 6: - * - DLM lockres remote refcount fixes. - * - * New in version 5: - * - Network timeout checking protocol - * - * New in version 4: - * - Remove i_generation from lock names for better stat performance. - * - * New in version 3: - * - Replace dentry votes with a cluster lock - * - * New in version 2: - * - full 64 bit i_size in the metadata lock lvbs - * - introduction of "rw" lock and pushing meta/data locking down - */ -#define R2NET_PROTOCOL_VERSION 11ULL -struct r2net_handshake { - __be64 protocol_version; - __be64 connector_id; - __be32 r2hb_heartbeat_timeout_ms; - __be32 r2net_idle_timeout_ms; - __be32 r2net_keepalive_delay_ms; - __be32 r2net_reconnect_delay_ms; -}; - -struct r2net_node { - /* this is never called from int/bh */ - spinlock_t nn_lock; - - /* set the moment an sc is allocated and a connect is started */ - struct r2net_sock_container *nn_sc; - /* _valid is only set after the handshake passes and tx can happen */ - unsigned nn_sc_valid:1; - /* if this is set tx just returns it */ - int nn_persistent_error; - /* It is only set to 1 after the idle time out. */ - atomic_t nn_timeout; - - /* threads waiting for an sc to arrive wait on the wq for generation - * to increase. it is increased when a connecting socket succeeds - * or fails or when an accepted socket is attached. */ - wait_queue_head_t nn_sc_wq; - - struct idr nn_status_idr; - struct list_head nn_status_list; - - /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before R2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ - struct delayed_work nn_connect_work; - unsigned long nn_last_connect_attempt; - - /* this is queued as nodes come up and is canceled when a connection is - * established. this expiring gives up on the node and errors out - * transmits */ - struct delayed_work nn_connect_expired; - - /* after we give up on a socket we wait a while before deciding - * that it is still heartbeating and that we should do some - * quorum work */ - struct delayed_work nn_still_up; -}; - -struct r2net_sock_container { - struct kref sc_kref; - /* the next two are valid for the life time of the sc */ - struct socket *sc_sock; - struct r2nm_node *sc_node; - - /* all of these sc work structs hold refs on the sc while they are - * queued. they should not be able to ref a freed sc. the teardown - * race is with r2net_wq destruction in r2net_stop_listening() */ - - /* rx and connect work are generated from socket callbacks. sc - * shutdown removes the callbacks and then flushes the work queue */ - struct work_struct sc_rx_work; - struct work_struct sc_connect_work; - /* shutdown work is triggered in two ways. the simple way is - * for a code path calls ensure_shutdown which gets a lock, removes - * the sc from the nn, and queues the work. in this case the - * work is single-shot. the work is also queued from a sock - * callback, though, and in this case the work will find the sc - * still on the nn and will call ensure_shutdown itself.. this - * ends up triggering the shutdown work again, though nothing - * will be done in that second iteration. so work queue teardown - * has to be careful to remove the sc from the nn before waiting - * on the work queue so that the shutdown work doesn't remove the - * sc and rearm itself. - */ - struct work_struct sc_shutdown_work; - - struct timer_list sc_idle_timeout; - struct delayed_work sc_keepalive_work; - - unsigned sc_handshake_ok:1; - - struct page *sc_page; - size_t sc_page_off; - - /* original handlers for the sockets */ - void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); - - u32 sc_msg_key; - u16 sc_msg_type; - -#ifdef CONFIG_DEBUG_FS - struct list_head sc_net_debug_item; - ktime_t sc_tv_timer; - ktime_t sc_tv_data_ready; - ktime_t sc_tv_advance_start; - ktime_t sc_tv_advance_stop; - ktime_t sc_tv_func_start; - ktime_t sc_tv_func_stop; -#endif -#ifdef CONFIG_RAMSTER_FS_STATS - ktime_t sc_tv_acquiry_total; - ktime_t sc_tv_send_total; - ktime_t sc_tv_status_total; - u32 sc_send_count; - u32 sc_recv_count; - ktime_t sc_tv_process_total; -#endif - struct mutex sc_send_lock; -}; - -struct r2net_msg_handler { - struct rb_node nh_node; - u32 nh_max_len; - u32 nh_msg_type; - u32 nh_key; - r2net_msg_handler_func *nh_func; - r2net_msg_handler_func *nh_func_data; - r2net_post_msg_handler_func - *nh_post_func; - struct kref nh_kref; - struct list_head nh_unregister_item; -}; - -enum r2net_system_error { - R2NET_ERR_NONE = 0, - R2NET_ERR_NO_HNDLR, - R2NET_ERR_OVERFLOW, - R2NET_ERR_DIED, - R2NET_ERR_MAX -}; - -struct r2net_status_wait { - enum r2net_system_error ns_sys_status; - s32 ns_status; - int ns_id; - wait_queue_head_t ns_wq; - struct list_head ns_node_item; -}; - -#ifdef CONFIG_DEBUG_FS -/* just for state dumps */ -struct r2net_send_tracking { - struct list_head st_net_debug_item; - struct task_struct *st_task; - struct r2net_sock_container *st_sc; - u32 st_id; - u32 st_msg_type; - u32 st_msg_key; - u8 st_node; - ktime_t st_sock_time; - ktime_t st_send_time; - ktime_t st_status_time; -}; -#else -struct r2net_send_tracking { - u32 dummy; -}; -#endif /* CONFIG_DEBUG_FS */ - -#endif /* R2CLUSTER_TCP_INTERNAL_H */ diff --git a/drivers/staging/ramster/r2net.c b/drivers/staging/ramster/r2net.c deleted file mode 100644 index 2ee02204c..000000000 --- a/drivers/staging/ramster/r2net.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - * r2net.c - * - * Copyright (c) 2011, Dan Magenheimer, Oracle Corp. - * - * Ramster_r2net provides an interface between zcache and r2net. - * - * FIXME: support more than two nodes - */ - -#include -#include "cluster/tcp.h" -#include "cluster/nodemanager.h" -#include "tmem.h" -#include "zcache.h" -#include "ramster.h" - -#define RAMSTER_TESTING - -#define RMSTR_KEY 0x77347734 - -enum { - RMSTR_TMEM_PUT_EPH = 100, - RMSTR_TMEM_PUT_PERS, - RMSTR_TMEM_ASYNC_GET_REQUEST, - RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST, - RMSTR_TMEM_ASYNC_GET_REPLY, - RMSTR_TMEM_FLUSH, - RMSTR_TMEM_FLOBJ, - RMSTR_TMEM_DESTROY_POOL, -}; - -#define RMSTR_R2NET_MAX_LEN \ - (R2NET_MAX_PAYLOAD_BYTES - sizeof(struct tmem_xhandle)) - -#include "cluster/tcp_internal.h" - -static struct r2nm_node *r2net_target_node; -static int r2net_target_nodenum; - -int r2net_remote_target_node_set(int node_num) -{ - int ret = -1; - - r2net_target_node = r2nm_get_node_by_num(node_num); - if (r2net_target_node != NULL) { - r2net_target_nodenum = node_num; - r2nm_node_put(r2net_target_node); - ret = 0; - } - return ret; -} - -/* FIXME following buffer should be per-cpu, protected by preempt_disable */ -static char ramster_async_get_buf[R2NET_MAX_PAYLOAD_BYTES]; - -static int ramster_remote_async_get_request_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - char *pdata; - struct tmem_xhandle xh; - int found; - size_t size = RMSTR_R2NET_MAX_LEN; - u16 msgtype = be16_to_cpu(msg->msg_type); - bool get_and_free = (msgtype == RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST); - unsigned long flags; - - xh = *(struct tmem_xhandle *)msg->buf; - if (xh.xh_data_size > RMSTR_R2NET_MAX_LEN) - BUG(); - pdata = ramster_async_get_buf; - *(struct tmem_xhandle *)pdata = xh; - pdata += sizeof(struct tmem_xhandle); - local_irq_save(flags); - found = zcache_get(xh.client_id, xh.pool_id, &xh.oid, xh.index, - pdata, &size, 1, get_and_free ? 1 : -1); - local_irq_restore(flags); - if (found < 0) { - /* a zero size indicates the get failed */ - size = 0; - } - if (size > RMSTR_R2NET_MAX_LEN) - BUG(); - *ret_data = pdata - sizeof(struct tmem_xhandle); - /* now make caller (r2net_process_message) handle specially */ - r2net_force_data_magic(msg, RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY); - return size + sizeof(struct tmem_xhandle); -} - -static int ramster_remote_async_get_reply_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - char *in = (char *)msg->buf; - int datalen = len - sizeof(struct r2net_msg); - int ret = -1; - struct tmem_xhandle *xh = (struct tmem_xhandle *)in; - - in += sizeof(struct tmem_xhandle); - datalen -= sizeof(struct tmem_xhandle); - BUG_ON(datalen < 0 || datalen > PAGE_SIZE); - ret = zcache_localify(xh->pool_id, &xh->oid, xh->index, - in, datalen, xh->extra); -#ifdef RAMSTER_TESTING - if (ret == -EEXIST) - pr_err("TESTING ArrgREP, aborted overwrite on racy put\n"); -#endif - return ret; -} - -int ramster_remote_put_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - int datalen = len - sizeof(struct r2net_msg) - - sizeof(struct tmem_xhandle); - u16 msgtype = be16_to_cpu(msg->msg_type); - bool ephemeral = (msgtype == RMSTR_TMEM_PUT_EPH); - unsigned long flags; - int ret; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - zcache_autocreate_pool(xh->client_id, xh->pool_id, ephemeral); - local_irq_save(flags); - ret = zcache_put(xh->client_id, xh->pool_id, &xh->oid, xh->index, - p, datalen, 1, ephemeral ? 1 : -1); - local_irq_restore(flags); - return ret; -} - -int ramster_remote_flush_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - (void)zcache_flush(xh->client_id, xh->pool_id, &xh->oid, xh->index); - return 0; -} - -int ramster_remote_flobj_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - (void)zcache_flush_object(xh->client_id, xh->pool_id, &xh->oid); - return 0; -} - -int ramster_remote_async_get(struct tmem_xhandle *xh, bool free, int remotenode, - size_t expect_size, uint8_t expect_cksum, - void *extra) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - u32 msg_type; - - node = r2nm_get_node_by_num(remotenode); - if (node == NULL) - goto out; - xh->client_id = r2nm_this_node(); /* which node is getting */ - xh->xh_data_cksum = expect_cksum; - xh->xh_data_size = expect_size; - xh->extra = extra; - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - if (free) - msg_type = RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST; - else - msg_type = RMSTR_TMEM_ASYNC_GET_REQUEST; - ret = r2net_send_message_vec(msg_type, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - if (ret < 0) { - /* FIXME handle bad message possibilities here? */ - pr_err("UNTESTED ret<0 in ramster_remote_async_get\n"); - } - ret = status; -out: - return ret; -} - -#ifdef RAMSTER_TESTING -/* leave me here to see if it catches a weird crash */ -static void ramster_check_irq_counts(void) -{ - static int last_hardirq_cnt, last_softirq_cnt, last_preempt_cnt; - int cur_hardirq_cnt, cur_softirq_cnt, cur_preempt_cnt; - - cur_hardirq_cnt = hardirq_count() >> HARDIRQ_SHIFT; - if (cur_hardirq_cnt > last_hardirq_cnt) { - last_hardirq_cnt = cur_hardirq_cnt; - if (!(last_hardirq_cnt&(last_hardirq_cnt-1))) - pr_err("RAMSTER TESTING RRP hardirq_count=%d\n", - last_hardirq_cnt); - } - cur_softirq_cnt = softirq_count() >> SOFTIRQ_SHIFT; - if (cur_softirq_cnt > last_softirq_cnt) { - last_softirq_cnt = cur_softirq_cnt; - if (!(last_softirq_cnt&(last_softirq_cnt-1))) - pr_err("RAMSTER TESTING RRP softirq_count=%d\n", - last_softirq_cnt); - } - cur_preempt_cnt = preempt_count() & PREEMPT_MASK; - if (cur_preempt_cnt > last_preempt_cnt) { - last_preempt_cnt = cur_preempt_cnt; - if (!(last_preempt_cnt&(last_preempt_cnt-1))) - pr_err("RAMSTER TESTING RRP preempt_count=%d\n", - last_preempt_cnt); - } -} -#endif - -int ramster_remote_put(struct tmem_xhandle *xh, char *data, size_t size, - bool ephemeral, int *remotenode) -{ - int nodenum, ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[2]; - size_t veclen = 2; - u32 msg_type; -#ifdef RAMSTER_TESTING - struct r2net_node *nn; -#endif - - BUG_ON(size > RMSTR_R2NET_MAX_LEN); - xh->client_id = r2nm_this_node(); /* which node is putting */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - vec[1].iov_len = size; - vec[1].iov_base = data; - node = r2net_target_node; - if (!node) - goto out; - - nodenum = r2net_target_nodenum; - - r2nm_node_get(node); - -#ifdef RAMSTER_TESTING - nn = r2net_nn_from_num(nodenum); - WARN_ON_ONCE(nn->nn_persistent_error || !nn->nn_sc_valid); -#endif - - if (ephemeral) - msg_type = RMSTR_TMEM_PUT_EPH; - else - msg_type = RMSTR_TMEM_PUT_PERS; -#ifdef RAMSTER_TESTING - /* leave me here to see if it catches a weird crash */ - ramster_check_irq_counts(); -#endif - - ret = r2net_send_message_vec(msg_type, RMSTR_KEY, vec, veclen, - nodenum, &status); -#ifdef RAMSTER_TESTING - if (ret != 0) { - static unsigned long cnt; - cnt++; - if (!(cnt&(cnt-1))) - pr_err("ramster_remote_put: message failed, " - "ret=%d, cnt=%lu\n", ret, cnt); - ret = -1; - } -#endif - if (ret < 0) - ret = -1; - else { - ret = status; - *remotenode = nodenum; - } - - r2nm_node_put(node); -out: - return ret; -} - -int ramster_remote_flush(struct tmem_xhandle *xh, int remotenode) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - - node = r2nm_get_node_by_num(remotenode); - BUG_ON(node == NULL); - xh->client_id = r2nm_this_node(); /* which node is flushing */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - BUG_ON(irqs_disabled()); - BUG_ON(in_softirq()); - ret = r2net_send_message_vec(RMSTR_TMEM_FLUSH, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - return ret; -} - -int ramster_remote_flush_object(struct tmem_xhandle *xh, int remotenode) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - - node = r2nm_get_node_by_num(remotenode); - BUG_ON(node == NULL); - xh->client_id = r2nm_this_node(); /* which node is flobjing */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - ret = r2net_send_message_vec(RMSTR_TMEM_FLOBJ, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - return ret; -} - -/* - * Handler registration - */ - -static LIST_HEAD(r2net_unreg_list); - -static void r2net_unregister_handlers(void) -{ - r2net_unregister_handler_list(&r2net_unreg_list); -} - -int r2net_register_handlers(void) -{ - int status; - - status = r2net_register_handler(RMSTR_TMEM_PUT_EPH, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_put_handler, - NULL, NULL, &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_PUT_PERS, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_put_handler, - NULL, NULL, &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REQUEST, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_request_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST, - RMSTR_KEY, RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_request_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_reply_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_FLUSH, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_flush_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_FLOBJ, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_flobj_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - pr_info("ramster: r2net handlers registered\n"); - -bail: - if (status) { - r2net_unregister_handlers(); - pr_err("ramster: couldn't register r2net handlers\n"); - } - return status; -} diff --git a/drivers/staging/ramster/ramster.h b/drivers/staging/ramster/ramster.h deleted file mode 100644 index 0c9455e8d..000000000 --- a/drivers/staging/ramster/ramster.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * ramster.h - * - * Peer-to-peer transcendent memory - * - * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _RAMSTER_H_ -#define _RAMSTER_H_ - -/* - * format of remote pampd: - * bit 0 == intransit - * bit 1 == is_remote... if this bit is set, then - * bit 2-9 == remotenode - * bit 10-22 == size - * bit 23-30 == cksum - */ -#define FAKE_PAMPD_INTRANSIT_BITS 1 -#define FAKE_PAMPD_ISREMOTE_BITS 1 -#define FAKE_PAMPD_REMOTENODE_BITS 8 -#define FAKE_PAMPD_REMOTESIZE_BITS 13 -#define FAKE_PAMPD_CHECKSUM_BITS 8 - -#define FAKE_PAMPD_INTRANSIT_SHIFT 0 -#define FAKE_PAMPD_ISREMOTE_SHIFT (FAKE_PAMPD_INTRANSIT_SHIFT + \ - FAKE_PAMPD_INTRANSIT_BITS) -#define FAKE_PAMPD_REMOTENODE_SHIFT (FAKE_PAMPD_ISREMOTE_SHIFT + \ - FAKE_PAMPD_ISREMOTE_BITS) -#define FAKE_PAMPD_REMOTESIZE_SHIFT (FAKE_PAMPD_REMOTENODE_SHIFT + \ - FAKE_PAMPD_REMOTENODE_BITS) -#define FAKE_PAMPD_CHECKSUM_SHIFT (FAKE_PAMPD_REMOTESIZE_SHIFT + \ - FAKE_PAMPD_REMOTESIZE_BITS) - -#define FAKE_PAMPD_MASK(x) ((1UL << (x)) - 1) - -static inline void *pampd_make_remote(int remotenode, size_t size, - unsigned char cksum) -{ - unsigned long fake_pampd = 0; - fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT; - fake_pampd |= ((unsigned long)remotenode & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS)) << - FAKE_PAMPD_REMOTENODE_SHIFT; - fake_pampd |= ((unsigned long)size & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS)) << - FAKE_PAMPD_REMOTESIZE_SHIFT; - fake_pampd |= ((unsigned long)cksum & - FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS)) << - FAKE_PAMPD_CHECKSUM_SHIFT; - return (void *)fake_pampd; -} - -static inline unsigned int pampd_remote_node(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_REMOTENODE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS); -} - -static inline unsigned int pampd_remote_size(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_REMOTESIZE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS); -} - -static inline unsigned char pampd_remote_cksum(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_CHECKSUM_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS); -} - -static inline bool pampd_is_remote(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_ISREMOTE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_ISREMOTE_BITS); -} - -static inline bool pampd_is_intransit(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_INTRANSIT_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_INTRANSIT_BITS); -} - -/* note that it is a BUG for intransit to be set without isremote also set */ -static inline void *pampd_mark_intransit(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - - fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT; - fake_pampd |= 1UL << FAKE_PAMPD_INTRANSIT_SHIFT; - return (void *)fake_pampd; -} - -static inline void *pampd_mask_intransit_and_remote(void *marked_pampd) -{ - unsigned long pampd = (unsigned long)marked_pampd; - - pampd &= ~(1UL << FAKE_PAMPD_INTRANSIT_SHIFT); - pampd &= ~(1UL << FAKE_PAMPD_ISREMOTE_SHIFT); - return (void *)pampd; -} - -extern int ramster_remote_async_get(struct tmem_xhandle *, - bool, int, size_t, uint8_t, void *extra); -extern int ramster_remote_put(struct tmem_xhandle *, char *, size_t, - bool, int *); -extern int ramster_remote_flush(struct tmem_xhandle *, int); -extern int ramster_remote_flush_object(struct tmem_xhandle *, int); -extern int r2net_register_handlers(void); -extern int r2net_remote_target_node_set(int); - -#endif /* _TMEM_H */ diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c deleted file mode 100644 index 8f2f6892d..000000000 --- a/drivers/staging/ramster/tmem.c +++ /dev/null @@ -1,851 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). - */ - -#include -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. - */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - (*tmem_pamops.new_obj)(obj); - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! */ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. - * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not found) - */ -static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? (void **)slot : NULL; -} - -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode **slot; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - return slot != NULL ? *slot : NULL; -} - -static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, - void *new_pampd, bool no_free) -{ - struct tmem_objnode **slot; - void *ret = NULL; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - if ((slot != NULL) && (*slot != NULL)) { - void *old_pampd = *(void **)slot; - *(void **)slot = new_pampd; - if (!no_free) - (*tmem_pamops.free)(old_pampd, obj->pool, - NULL, 0, false); - ret = new_pampd; - } - return ret; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. */ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool, NULL, 0, true); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, - obj->pool, NULL, 0, true); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; - (*tmem_pamops.free_obj)(obj->pool, obj); -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. - */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t size, bool raw, int ephemeral) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool, oidp, index, true); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(data, size, raw, ephemeral, - obj->pool, &obj->oid, index); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool, NULL, 0, true); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, struct tmem_obj **ret_obj, - void **saved_hb) -{ - struct tmem_hashbucket *hb; - struct tmem_obj *obj = NULL; - void *pampd = NULL; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (likely(obj != NULL)) - pampd = tmem_pampd_lookup_in_obj(obj, index); - *ret_obj = obj; - *saved_hb = (void *)hb; - /* note, hashbucket remains locked */ - return pampd; -} - -void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, - void *pampd, void *saved_hb, bool delete) -{ - struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb; - - BUG_ON(!spin_is_locked(&hb->lock)); - if (pampd != NULL) { - BUG_ON(obj == NULL); - (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1); - } else if (delete) { - BUG_ON(obj == NULL); - (void)tmem_pampd_delete_from_obj(obj, index); - } - spin_unlock(&hb->lock); -} - -static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb, - struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, bool free, char *data) -{ - void *old_pampd = *ppampd, *new_pampd = NULL; - bool intransit = false; - int ret = 0; - - - if (!is_ephemeral(pool)) - new_pampd = (*tmem_pamops.repatriate_preload)( - old_pampd, pool, oidp, index, &intransit); - if (intransit) - ret = -EAGAIN; - else if (new_pampd != NULL) - *ppampd = new_pampd; - /* must release the hb->lock else repatriate can't sleep */ - spin_unlock(&hb->lock); - if (!intransit) - ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool, - oidp, index, free, data); - return ret; -} - -/* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). - - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t *size, bool raw, int get_and_free) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - int ret = -1; - struct tmem_hashbucket *hb; - bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); - bool lock_held = 0; - void **ppampd; - -again: - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - lock_held = 1; - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - ppampd = __tmem_pampd_lookup_in_obj(obj, index); - if (ppampd == NULL) - goto out; - if (tmem_pamops.is_remote(*ppampd)) { - ret = tmem_repatriate(ppampd, hb, pool, oidp, - index, free, data); - lock_held = 0; /* note hb->lock has been unlocked */ - if (ret == -EAGAIN) { - /* rare I think, but should cond_resched()??? */ - usleep_range(10, 1000); - goto again; - } else if (ret != 0) { - if (ret != -ENOENT) - pr_err("UNTESTED case in tmem_get, ret=%d\n", - ret); - ret = -1; - goto out; - } - goto out; - } - if (free) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - if (free) { - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - if (free) - ret = (*tmem_pamops.get_data_and_free)( - data, size, raw, pampd, pool, oidp, index); - else - ret = (*tmem_pamops.get_data)( - data, size, raw, pampd, pool, oidp, index); - if (ret < 0) - goto out; - ret = 0; -out: - if (lock_held) - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). - */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool, oidp, index, true); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, replace the page so that any - * subsequent "get" gets the new page. Returns the new page if - * there was a page to replace, else returns NULL. - */ -int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, void *new_pampd) -{ - struct tmem_obj *obj; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0); - ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. - */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. - */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h deleted file mode 100644 index 47f1918c8..000000000 --- a/drivers/staging/ramster/tmem.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. - */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). - */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -struct tmem_xhandle { - uint8_t client_id; - uint8_t xh_data_cksum; - uint16_t xh_data_size; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - void *extra; -}; - -static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id, - struct tmem_pool *pool, - struct tmem_oid *oidp, - uint32_t index) -{ - struct tmem_xhandle xh; - xh.client_id = client_id; - xh.xh_data_cksum = (uint8_t)-1; - xh.xh_data_size = (uint16_t)-1; - xh.pool_id = pool->pool_id; - xh.oid = *oidp; - xh.index = index; - return xh; -} - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. - */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - /* for current design of ramster, all pages belonging to - * an object reside on the same remotenode and extra is - * used to record the number of the remotenode so a - * flush-object operation can specify it */ - void *extra; /* for use by pampd implementation */ - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(char *, size_t, bool, int, - struct tmem_pool *, struct tmem_oid *, uint32_t); - int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t); - int (*get_data_and_free)(char *, size_t *, bool, void *, - struct tmem_pool *, struct tmem_oid *, - uint32_t); - void (*free)(void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool); - void (*free_obj)(struct tmem_pool *, struct tmem_obj *); - bool (*is_remote)(void *); - void *(*repatriate_preload)(void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool *); - int (*repatriate)(void *, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool, void *); - void (*new_obj)(struct tmem_obj *); - int (*replace_in_obj)(void *, struct tmem_obj *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t, bool, int); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t *, bool, int); -extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, - void *); -extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *, - uint32_t index, struct tmem_obj **, - void **); -extern void tmem_localify_finish(struct tmem_obj *, uint32_t index, - void *, void *, bool); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* _TMEM_H */ diff --git a/drivers/staging/ramster/xvmalloc.c b/drivers/staging/ramster/xvmalloc.c deleted file mode 100644 index 93ba8e940..000000000 --- a/drivers/staging/ramster/xvmalloc.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xvmalloc.h" -#include "xvmalloc_int.h" - -static void stat_inc(u64 *value) -{ - *value = *value + 1; -} - -static void stat_dec(u64 *value) -{ - *value = *value - 1; -} - -static int test_flag(struct block_header *block, enum blockflags flag) -{ - return block->prev & BIT(flag); -} - -static void set_flag(struct block_header *block, enum blockflags flag) -{ - block->prev |= BIT(flag); -} - -static void clear_flag(struct block_header *block, enum blockflags flag) -{ - block->prev &= ~BIT(flag); -} - -/* - * Given pair, provide a dereferencable pointer. - * This is called from xv_malloc/xv_free path, so it - * needs to be fast. - */ -static void *get_ptr_atomic(struct page *page, u16 offset) -{ - unsigned char *base; - - base = kmap_atomic(page); - return base + offset; -} - -static void put_ptr_atomic(void *ptr) -{ - kunmap_atomic(ptr); -} - -static u32 get_blockprev(struct block_header *block) -{ - return block->prev & PREV_MASK; -} - -static void set_blockprev(struct block_header *block, u16 new_offset) -{ - block->prev = new_offset | (block->prev & FLAGS_MASK); -} - -static struct block_header *BLOCK_NEXT(struct block_header *block) -{ - return (struct block_header *) - ((char *)block + block->size + XV_ALIGN); -} - -/* - * Get index of free list containing blocks of maximum size - * which is less than or equal to given size. - */ -static u32 get_index_for_insert(u32 size) -{ - if (unlikely(size > XV_MAX_ALLOC_SIZE)) - size = XV_MAX_ALLOC_SIZE; - size &= ~FL_DELTA_MASK; - return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT; -} - -/* - * Get index of free list having blocks of size greater than - * or equal to requested size. - */ -static u32 get_index(u32 size) -{ - if (unlikely(size < XV_MIN_ALLOC_SIZE)) - size = XV_MIN_ALLOC_SIZE; - size = ALIGN(size, FL_DELTA); - return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT; -} - -/** - * find_block - find block of at least given size - * @pool: memory pool to search from - * @size: size of block required - * @page: page containing required block - * @offset: offset within the page where block is located. - * - * Searches two level bitmap to locate block of at least - * the given size. If such a block is found, it provides - * to identify this block and returns index - * in freelist where we found this block. - * Otherwise, returns 0 and params are not touched. - */ -static u32 find_block(struct xv_pool *pool, u32 size, - struct page **page, u32 *offset) -{ - ulong flbitmap, slbitmap; - u32 flindex, slindex, slbitstart; - - /* There are no free blocks in this pool */ - if (!pool->flbitmap) - return 0; - - /* Get freelist index correspoding to this size */ - slindex = get_index(size); - slbitmap = pool->slbitmap[slindex / BITS_PER_LONG]; - slbitstart = slindex % BITS_PER_LONG; - - /* - * If freelist is not empty at this index, we found the - * block - head of this list. This is approximate best-fit match. - */ - if (test_bit(slbitstart, &slbitmap)) { - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - return slindex; - } - - /* - * No best-fit found. Search a bit further in bitmap for a free block. - * Second level bitmap consists of series of 32-bit chunks. Search - * further in the chunk where we expected a best-fit, starting from - * index location found above. - */ - slbitstart++; - slbitmap >>= slbitstart; - - /* Skip this search if we were already at end of this bitmap chunk */ - if ((slbitstart != BITS_PER_LONG) && slbitmap) { - slindex += __ffs(slbitmap) + 1; - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - return slindex; - } - - /* Now do a full two-level bitmap search to find next nearest fit */ - flindex = slindex / BITS_PER_LONG; - - flbitmap = (pool->flbitmap) >> (flindex + 1); - if (!flbitmap) - return 0; - - flindex += __ffs(flbitmap) + 1; - slbitmap = pool->slbitmap[flindex]; - slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap); - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - - return slindex; -} - -/* - * Insert block at in freelist of given pool. - * freelist used depends on block size. - */ -static void insert_block(struct xv_pool *pool, struct page *page, u32 offset, - struct block_header *block) -{ - u32 flindex, slindex; - struct block_header *nextblock; - - slindex = get_index_for_insert(block->size); - flindex = slindex / BITS_PER_LONG; - - block->link.prev_page = NULL; - block->link.prev_offset = 0; - block->link.next_page = pool->freelist[slindex].page; - block->link.next_offset = pool->freelist[slindex].offset; - pool->freelist[slindex].page = page; - pool->freelist[slindex].offset = offset; - - if (block->link.next_page) { - nextblock = get_ptr_atomic(block->link.next_page, - block->link.next_offset); - nextblock->link.prev_page = page; - nextblock->link.prev_offset = offset; - put_ptr_atomic(nextblock); - /* If there was a next page then the free bits are set. */ - return; - } - - __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]); - __set_bit(flindex, &pool->flbitmap); -} - -/* - * Remove block from freelist. Index 'slindex' identifies the freelist. - */ -static void remove_block(struct xv_pool *pool, struct page *page, u32 offset, - struct block_header *block, u32 slindex) -{ - u32 flindex = slindex / BITS_PER_LONG; - struct block_header *tmpblock; - - if (block->link.prev_page) { - tmpblock = get_ptr_atomic(block->link.prev_page, - block->link.prev_offset); - tmpblock->link.next_page = block->link.next_page; - tmpblock->link.next_offset = block->link.next_offset; - put_ptr_atomic(tmpblock); - } - - if (block->link.next_page) { - tmpblock = get_ptr_atomic(block->link.next_page, - block->link.next_offset); - tmpblock->link.prev_page = block->link.prev_page; - tmpblock->link.prev_offset = block->link.prev_offset; - put_ptr_atomic(tmpblock); - } - - /* Is this block is at the head of the freelist? */ - if (pool->freelist[slindex].page == page - && pool->freelist[slindex].offset == offset) { - - pool->freelist[slindex].page = block->link.next_page; - pool->freelist[slindex].offset = block->link.next_offset; - - if (pool->freelist[slindex].page) { - struct block_header *tmpblock; - tmpblock = get_ptr_atomic(pool->freelist[slindex].page, - pool->freelist[slindex].offset); - tmpblock->link.prev_page = NULL; - tmpblock->link.prev_offset = 0; - put_ptr_atomic(tmpblock); - } else { - /* This freelist bucket is empty */ - __clear_bit(slindex % BITS_PER_LONG, - &pool->slbitmap[flindex]); - if (!pool->slbitmap[flindex]) - __clear_bit(flindex, &pool->flbitmap); - } - } - - block->link.prev_page = NULL; - block->link.prev_offset = 0; - block->link.next_page = NULL; - block->link.next_offset = 0; -} - -/* - * Allocate a page and add it to freelist of given pool. - */ -static int grow_pool(struct xv_pool *pool, gfp_t flags) -{ - struct page *page; - struct block_header *block; - - page = alloc_page(flags); - if (unlikely(!page)) - return -ENOMEM; - - stat_inc(&pool->total_pages); - - spin_lock(&pool->lock); - block = get_ptr_atomic(page, 0); - - block->size = PAGE_SIZE - XV_ALIGN; - set_flag(block, BLOCK_FREE); - clear_flag(block, PREV_FREE); - set_blockprev(block, 0); - - insert_block(pool, page, 0, block); - - put_ptr_atomic(block); - spin_unlock(&pool->lock); - - return 0; -} - -/* - * Create a memory pool. Allocates freelist, bitmaps and other - * per-pool metadata. - */ -struct xv_pool *xv_create_pool(void) -{ - u32 ovhd_size; - struct xv_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); - if (!pool) - return NULL; - - spin_lock_init(&pool->lock); - - return pool; -} -EXPORT_SYMBOL_GPL(xv_create_pool); - -void xv_destroy_pool(struct xv_pool *pool) -{ - kfree(pool); -} -EXPORT_SYMBOL_GPL(xv_destroy_pool); - -/** - * xv_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * @page: page no. that holds the object - * @offset: location of object within page - * - * On success, identifies block allocated - * and 0 is returned. On failure, is set to - * 0 and -ENOMEM is returned. - * - * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail. - */ -int xv_malloc(struct xv_pool *pool, u32 size, struct page **page, - u32 *offset, gfp_t flags) -{ - int error; - u32 index, tmpsize, origsize, tmpoffset; - struct block_header *block, *tmpblock; - - *page = NULL; - *offset = 0; - origsize = size; - - if (unlikely(!size || size > XV_MAX_ALLOC_SIZE)) - return -ENOMEM; - - size = ALIGN(size, XV_ALIGN); - - spin_lock(&pool->lock); - - index = find_block(pool, size, page, offset); - - if (!*page) { - spin_unlock(&pool->lock); - if (flags & GFP_NOWAIT) - return -ENOMEM; - error = grow_pool(pool, flags); - if (unlikely(error)) - return error; - - spin_lock(&pool->lock); - index = find_block(pool, size, page, offset); - } - - if (!*page) { - spin_unlock(&pool->lock); - return -ENOMEM; - } - - block = get_ptr_atomic(*page, *offset); - - remove_block(pool, *page, *offset, block, index); - - /* Split the block if required */ - tmpoffset = *offset + size + XV_ALIGN; - tmpsize = block->size - size; - tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN); - if (tmpsize) { - tmpblock->size = tmpsize - XV_ALIGN; - set_flag(tmpblock, BLOCK_FREE); - clear_flag(tmpblock, PREV_FREE); - - set_blockprev(tmpblock, *offset); - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) - insert_block(pool, *page, tmpoffset, tmpblock); - - if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) { - tmpblock = BLOCK_NEXT(tmpblock); - set_blockprev(tmpblock, tmpoffset); - } - } else { - /* This block is exact fit */ - if (tmpoffset != PAGE_SIZE) - clear_flag(tmpblock, PREV_FREE); - } - - block->size = origsize; - clear_flag(block, BLOCK_FREE); - - put_ptr_atomic(block); - spin_unlock(&pool->lock); - - *offset += XV_ALIGN; - - return 0; -} -EXPORT_SYMBOL_GPL(xv_malloc); - -/* - * Free block identified with - */ -void xv_free(struct xv_pool *pool, struct page *page, u32 offset) -{ - void *page_start; - struct block_header *block, *tmpblock; - - offset -= XV_ALIGN; - - spin_lock(&pool->lock); - - page_start = get_ptr_atomic(page, 0); - block = (struct block_header *)((char *)page_start + offset); - - /* Catch double free bugs */ - BUG_ON(test_flag(block, BLOCK_FREE)); - - block->size = ALIGN(block->size, XV_ALIGN); - - tmpblock = BLOCK_NEXT(block); - if (offset + block->size + XV_ALIGN == PAGE_SIZE) - tmpblock = NULL; - - /* Merge next block if its free */ - if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) { - /* - * Blocks smaller than XV_MIN_ALLOC_SIZE - * are not inserted in any free list. - */ - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) { - remove_block(pool, page, - offset + block->size + XV_ALIGN, tmpblock, - get_index_for_insert(tmpblock->size)); - } - block->size += tmpblock->size + XV_ALIGN; - } - - /* Merge previous block if its free */ - if (test_flag(block, PREV_FREE)) { - tmpblock = (struct block_header *)((char *)(page_start) + - get_blockprev(block)); - offset = offset - tmpblock->size - XV_ALIGN; - - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) - remove_block(pool, page, offset, tmpblock, - get_index_for_insert(tmpblock->size)); - - tmpblock->size += block->size + XV_ALIGN; - block = tmpblock; - } - - /* No used objects in this page. Free it. */ - if (block->size == PAGE_SIZE - XV_ALIGN) { - put_ptr_atomic(page_start); - spin_unlock(&pool->lock); - - __free_page(page); - stat_dec(&pool->total_pages); - return; - } - - set_flag(block, BLOCK_FREE); - if (block->size >= XV_MIN_ALLOC_SIZE) - insert_block(pool, page, offset, block); - - if (offset + block->size + XV_ALIGN != PAGE_SIZE) { - tmpblock = BLOCK_NEXT(block); - set_flag(tmpblock, PREV_FREE); - set_blockprev(tmpblock, offset); - } - - put_ptr_atomic(page_start); - spin_unlock(&pool->lock); -} -EXPORT_SYMBOL_GPL(xv_free); - -u32 xv_get_object_size(void *obj) -{ - struct block_header *blk; - - blk = (struct block_header *)((char *)(obj) - XV_ALIGN); - return blk->size; -} -EXPORT_SYMBOL_GPL(xv_get_object_size); - -/* - * Returns total memory used by allocator (userdata + metadata) - */ -u64 xv_get_total_size_bytes(struct xv_pool *pool) -{ - return pool->total_pages << PAGE_SHIFT; -} -EXPORT_SYMBOL_GPL(xv_get_total_size_bytes); diff --git a/drivers/staging/ramster/xvmalloc.h b/drivers/staging/ramster/xvmalloc.h deleted file mode 100644 index 5b1a81aa5..000000000 --- a/drivers/staging/ramster/xvmalloc.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _XV_MALLOC_H_ -#define _XV_MALLOC_H_ - -#include - -struct xv_pool; - -struct xv_pool *xv_create_pool(void); -void xv_destroy_pool(struct xv_pool *pool); - -int xv_malloc(struct xv_pool *pool, u32 size, struct page **page, - u32 *offset, gfp_t flags); -void xv_free(struct xv_pool *pool, struct page *page, u32 offset); - -u32 xv_get_object_size(void *obj); -u64 xv_get_total_size_bytes(struct xv_pool *pool); - -#endif diff --git a/drivers/staging/ramster/xvmalloc_int.h b/drivers/staging/ramster/xvmalloc_int.h deleted file mode 100644 index b5f1f7feb..000000000 --- a/drivers/staging/ramster/xvmalloc_int.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _XV_MALLOC_INT_H_ -#define _XV_MALLOC_INT_H_ - -#include -#include - -/* User configurable params */ - -/* Must be power of two */ -#ifdef CONFIG_64BIT -#define XV_ALIGN_SHIFT 3 -#else -#define XV_ALIGN_SHIFT 2 -#endif -#define XV_ALIGN (1 << XV_ALIGN_SHIFT) -#define XV_ALIGN_MASK (XV_ALIGN - 1) - -/* This must be greater than sizeof(link_free) */ -#define XV_MIN_ALLOC_SIZE 32 -#define XV_MAX_ALLOC_SIZE (PAGE_SIZE - XV_ALIGN) - -/* - * Free lists are separated by FL_DELTA bytes - * This value is 3 for 4k pages and 4 for 64k pages, for any - * other page size, a conservative (PAGE_SHIFT - 9) is used. - */ -#if PAGE_SHIFT == 16 -#define FL_DELTA_SHIFT 4 -#else -#define FL_DELTA_SHIFT (PAGE_SHIFT - 9) -#endif -#define FL_DELTA (1 << FL_DELTA_SHIFT) -#define FL_DELTA_MASK (FL_DELTA - 1) -#define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \ - / FL_DELTA + 1) - -#define MAX_FLI DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG) - -/* End of user params */ - -enum blockflags { - BLOCK_FREE, - PREV_FREE, - __NR_BLOCKFLAGS, -}; - -#define FLAGS_MASK XV_ALIGN_MASK -#define PREV_MASK (~FLAGS_MASK) - -struct freelist_entry { - struct page *page; - u16 offset; - u16 pad; -}; - -struct link_free { - struct page *prev_page; - struct page *next_page; - u16 prev_offset; - u16 next_offset; -}; - -struct block_header { - union { - /* This common header must be XV_ALIGN bytes */ - u8 common[XV_ALIGN]; - struct { - u16 size; - u16 prev; - }; - }; - struct link_free link; -}; - -struct xv_pool { - ulong flbitmap; - ulong slbitmap[MAX_FLI]; - u64 total_pages; /* stats */ - struct freelist_entry freelist[NUM_FREE_LISTS]; - spinlock_t lock; -}; - -#endif diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c deleted file mode 100644 index 68b2e053a..000000000 --- a/drivers/staging/ramster/zcache-main.c +++ /dev/null @@ -1,3320 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing lzo1x compression: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) xvmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - * RAMSTER TODO: - * - handle remotifying of buddied pages (see zbud_remotify_zbpg) - * - kernel boot params: nocleancache/nofrontswap don't always work?!? - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" -#include "zcache.h" -#include "ramster.h" -#include "cluster/tcp.h" - -#include "xvmalloc.h" /* temporary until change to zsmalloc */ - -#define RAMSTER_TESTING - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -enum ramster_remotify_op { - RAMSTER_REMOTIFY_EPH_PUT, - RAMSTER_REMOTIFY_PERS_PUT, - RAMSTER_REMOTIFY_FLUSH_PAGE, - RAMSTER_REMOTIFY_FLUSH_OBJ, - RAMSTER_INTRANSIT_PERS -}; - -struct ramster_remotify_hdr { - enum ramster_remotify_op op; - struct list_head list; -}; - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - struct ramster_remotify_hdr rem_op; - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -#define ZVH_SENTINEL 0x43214321 -static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; - -struct zv_hdr { - struct ramster_remotify_hdr rem_op; - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - DECL_SENTINEL -}; - -struct flushlist_node { - struct ramster_remotify_hdr rem_op; - struct tmem_xhandle xh; -}; - -union { - struct ramster_remotify_hdr rem_op; - struct zv_hdr zv; - struct zbud_hdr zbud; - struct flushlist_node flist; -} remotify_list_node; - -static LIST_HEAD(zcache_rem_op_list); -static DEFINE_SPINLOCK(zcache_rem_op_list_lock); - -#if 0 -/* this is more aggressive but may cause other problems? */ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -#define MAX_POOLS_PER_CLIENT 16 - -#define MAX_CLIENTS 16 -#define LOCAL_CLIENT ((uint16_t)-1) - -MODULE_LICENSE("GPL"); - -struct zcache_client { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct xv_pool *xvpool; - bool allocated; - atomic_t refcount; -}; - -static struct zcache_client zcache_host; -static struct zcache_client zcache_clients[MAX_CLIENTS]; - -static inline uint16_t get_client_id_from_client(struct zcache_client *cli) -{ - BUG_ON(cli == NULL); - if (cli == &zcache_host) - return LOCAL_CLIENT; - return cli - &zcache_clients[0]; -} - -static inline bool is_local_client(struct zcache_client *cli) -{ - return cli == &zcache_host; -} - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. - */ - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; -static unsigned long zcache_policy_percent_exceeded; -static unsigned long zcache_mean_compress_poor; - -/* - * RAMster counters - * - Remote pages are pages with a local pampd but the data is remote - * - Foreign pages are pages stored locally but belonging to another node - */ -static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0); -static unsigned long ramster_pers_remotify_enable; -static unsigned long ramster_eph_remotify_enable; -static unsigned long ramster_eph_pages_remoted; -static unsigned long ramster_eph_pages_remote_failed; -static unsigned long ramster_pers_pages_remoted; -static unsigned long ramster_pers_pages_remote_failed; -static unsigned long ramster_pers_pages_remote_nomem; -static unsigned long ramster_remote_objects_flushed; -static unsigned long ramster_remote_object_flushes_failed; -static unsigned long ramster_remote_pages_flushed; -static unsigned long ramster_remote_page_flushes_failed; -static unsigned long ramster_remote_eph_pages_succ_get; -static unsigned long ramster_remote_pers_pages_succ_get; -static unsigned long ramster_remote_eph_pages_unsucc_get; -static unsigned long ramster_remote_pers_pages_unsucc_get; -static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0); -static unsigned long ramster_curr_flnode_count_max; -static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long ramster_foreign_eph_pampd_count_max; -static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long ramster_foreign_pers_pampd_count_max; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - BUG_ON(zh->size > *size); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK); - /* client should be filled in by caller */ - memcpy(data, p, zh->size); - *size = zh->size; - spin_unlock(&zbpg->lock); -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - atomic_dec(&zcache_zbud_curr_raw_pages); - zcache_free_page(zbpg); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - /* FIXME, should be BUG_ON, pool destruction path doesn't disable - * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()-> - * tmem_objnode_node_destroy()-> zcache_pampd_free() */ - WARN_ON(!irqs_disabled()); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - spin_lock(&zbud_budlists_spinlock); - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - spin_lock(&zbud_budlists_spinlock); - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, - struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - zh->client_id = client_id; - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - size_t out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page); - size = zh->size; - from_va = zbud_data(zh, size); - ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); - BUG_ON(ret != LZO_E_OK); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. - */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, - uint16_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; - uint32_t index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - unsigned long flags; - - ASSERT_SPINLOCK(&zbpg->lock); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - client_id[j] = zh->client_id; - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); - BUG_ON(pool == NULL); - local_irq_save(flags); - /* these flushes should dispose of any local storage */ - tmem_flush_page(pool, &oid[i], index[i]); - local_irq_restore(flags); - zcache_put_pool(pool); - } -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. - */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i, newly_unused_pages = 0; - - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - newly_unused_pages++; - local_bh_enable(); - if (--nr <= 0) - goto evict_unused; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto evict_unused; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - newly_unused_pages++; - local_bh_enable(); - if (--nr <= 0) - goto evict_unused; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); - -evict_unused: - return; -} - -static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem); - -static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data, - size_t size) -{ - struct tmem_pool *pool; - int i, remotenode, ret = -1; - unsigned char cksum, *p; - unsigned long flags; - - for (p = data, cksum = 0, i = 0; i < size; i++) - cksum += *p; - ret = ramster_remote_put(xh, data, size, true, &remotenode); - if (ret == 0) { - /* data was successfully remoted so change the local version - * to point to the remote node where it landed */ - pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id); - BUG_ON(pool == NULL); - local_irq_save(flags); - /* tmem_replace will also free up any local space */ - (void)tmem_replace(pool, &xh->oid, xh->index, - pampd_make_remote(remotenode, size, cksum)); - local_irq_restore(flags); - zcache_put_pool(pool); - ramster_eph_pages_remoted++; - ret = 0; - } else - ramster_eph_pages_remote_failed++; - return ret; -} - -static int zbud_remotify_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh1, *zh2 = NULL; - struct tmem_xhandle xh1, xh2 = { 0 }; - char *data1 = NULL, *data2 = NULL; - size_t size1 = 0, size2 = 0; - int ret = 0; - unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); - - ASSERT_SPINLOCK(&zbpg->lock); - if (zbpg->buddy[0].size == 0) - zh1 = &zbpg->buddy[1]; - else if (zbpg->buddy[1].size == 0) - zh1 = &zbpg->buddy[0]; - else { - zh1 = &zbpg->buddy[0]; - zh2 = &zbpg->buddy[1]; - } - /* don't remotify pages that are already remotified */ - if (zh1->client_id != LOCAL_CLIENT) - zh1 = NULL; - if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT)) - zh2 = NULL; - - /* copy the data and metadata so can release lock */ - if (zh1 != NULL) { - xh1.client_id = zh1->client_id; - xh1.pool_id = zh1->pool_id; - xh1.oid = zh1->oid; - xh1.index = zh1->index; - size1 = zh1->size; - data1 = zbud_data(zh1, size1); - memcpy(tmpmem, zbud_data(zh1, size1), size1); - data1 = tmpmem; - tmpmem += size1; - } - if (zh2 != NULL) { - xh2.client_id = zh2->client_id; - xh2.pool_id = zh2->pool_id; - xh2.oid = zh2->oid; - xh2.index = zh2->index; - size2 = zh2->size; - memcpy(tmpmem, zbud_data(zh2, size2), size2); - data2 = tmpmem; - } - spin_unlock(&zbpg->lock); - preempt_enable(); - - /* OK, no locks held anymore, remotify one or both zbuds */ - if (zh1 != NULL) - ret = zbud_remotify_zbud(&xh1, data1, size1); - if (zh2 != NULL) - ret |= zbud_remotify_zbud(&xh2, data2, size2); - return ret; -} - -void zbud_remotify_pages(int nr) -{ - struct zbud_page *zbpg; - int i, ret; - - /* - * for now just try remotifying unbuddied pages, starting with - * least space avail - */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - preempt_disable(); /* enable in zbud_remotify_zbpg */ - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); - continue; /* next i in for loop */ - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; /* next list_for_each_entry */ - zbud_unbuddied[i].count--; - /* want budlists unlocked when doing zbpg remotify */ - spin_unlock_bh(&zbud_budlists_spinlock); - ret = zbud_remotify_zbpg(zbpg); - /* preemption is re-enabled in zbud_remotify_zbpg */ - if (ret == 0) { - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - /* if fail to remotify any page, quit */ - pr_err("TESTING zbud_remotify_pages failed on page," - " trying to re-add\n"); - spin_lock_bh(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[i].list); - zbud_unbuddied[i].count++; - spin_unlock(&zbpg->lock); - spin_unlock_bh(&zbud_budlists_spinlock); - pr_err("TESTING zbud_remotify_pages failed on page," - " finished re-add\n"); - goto out; - } - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); - } - -next_buddied_zbpg: - preempt_disable(); /* enable in zbud_remotify_zbpg */ - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) - goto unlock_out; - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; /* next list_for_each_entry */ - zcache_zbud_buddied_count--; - /* want budlists unlocked when doing zbpg remotify */ - spin_unlock_bh(&zbud_budlists_spinlock); - ret = zbud_remotify_zbpg(zbpg); - /* preemption is re-enabled in zbud_remotify_zbpg */ - if (ret == 0) { - if (--nr <= 0) - goto out; - goto next_buddied_zbpg; - } - /* if fail to remotify any page, quit */ - pr_err("TESTING zbud_remotify_pages failed on BUDDIED page," - " trying to re-add\n"); - spin_lock_bh(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - spin_unlock(&zbpg->lock); - spin_unlock_bh(&zbud_budlists_spinlock); - pr_err("TESTING zbud_remotify_pages failed on BUDDIED page," - " finished re-add\n"); - goto out; - } -unlock_out: - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); -out: - return; -} - -/* the "flush list" asynchronously collects pages to remotely flush */ -#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1) -static void ramster_flnode_free(struct flushlist_node *, - struct tmem_pool *); - -static void zcache_remote_flush_page(struct flushlist_node *flnode) -{ - struct tmem_xhandle *xh; - int remotenode, ret; - - preempt_disable(); - xh = &flnode->xh; - remotenode = flnode->xh.client_id; - ret = ramster_remote_flush(xh, remotenode); - if (ret >= 0) - ramster_remote_pages_flushed++; - else - ramster_remote_page_flushes_failed++; - preempt_enable_no_resched(); - ramster_flnode_free(flnode, NULL); -} - -static void zcache_remote_flush_object(struct flushlist_node *flnode) -{ - struct tmem_xhandle *xh; - int remotenode, ret; - - preempt_disable(); - xh = &flnode->xh; - remotenode = flnode->xh.client_id; - ret = ramster_remote_flush_object(xh, remotenode); - if (ret >= 0) - ramster_remote_objects_flushed++; - else - ramster_remote_object_flushes_failed++; - preempt_enable_no_resched(); - ramster_flnode_free(flnode, NULL); -} - -static void zcache_remote_eph_put(struct zbud_hdr *zbud) -{ - /* FIXME */ -} - -static void zcache_remote_pers_put(struct zv_hdr *zv) -{ - struct tmem_xhandle xh; - uint16_t size; - bool ephemeral; - int remotenode, ret = -1; - char *data; - struct tmem_pool *pool; - unsigned long flags; - unsigned char cksum; - char *p; - int i; - unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); - - ASSERT_SENTINEL(zv, ZVH); - BUG_ON(zv->client_id != LOCAL_CLIENT); - local_bh_disable(); - xh.client_id = zv->client_id; - xh.pool_id = zv->pool_id; - xh.oid = zv->oid; - xh.index = zv->index; - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - data = (char *)zv + sizeof(*zv); - for (p = data, cksum = 0, i = 0; i < size; i++) - cksum += *p; - memcpy(tmpmem, data, size); - data = tmpmem; - pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id); - ephemeral = is_ephemeral(pool); - zcache_put_pool(pool); - /* now OK to release lock set in caller */ - spin_unlock(&zcache_rem_op_list_lock); - local_bh_enable(); - preempt_disable(); - ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode); - preempt_enable_no_resched(); - if (ret != 0) { - /* - * This is some form of a memory leak... if the remote put - * fails, there will never be another attempt to remotify - * this page. But since we've dropped the zv pointer, - * the page may have been freed or the data replaced - * so we can't just "put it back" in the remote op list. - * Even if we could, not sure where to put it in the list - * because there may be flushes that must be strictly - * ordered vs the put. So leave this as a FIXME for now. - * But count them so we know if it becomes a problem. - */ - ramster_pers_pages_remote_failed++; - goto out; - } else - atomic_inc(&ramster_remote_pers_pages); - ramster_pers_pages_remoted++; - /* - * data was successfully remoted so change the local version to - * point to the remote node where it landed - */ - local_bh_disable(); - pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id); - local_irq_save(flags); - (void)tmem_replace(pool, &xh.oid, xh.index, - pampd_make_remote(remotenode, size, cksum)); - local_irq_restore(flags); - zcache_put_pool(pool); - local_bh_enable(); -out: - return; -} - -static void zcache_do_remotify_ops(int nr) -{ - struct ramster_remotify_hdr *rem_op; - union remotify_list_node *u; - - while (1) { - if (!nr) - goto out; - spin_lock(&zcache_rem_op_list_lock); - if (list_empty(&zcache_rem_op_list)) { - spin_unlock(&zcache_rem_op_list_lock); - goto out; - } - rem_op = list_first_entry(&zcache_rem_op_list, - struct ramster_remotify_hdr, list); - list_del_init(&rem_op->list); - if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT) - spin_unlock(&zcache_rem_op_list_lock); - u = (union remotify_list_node *)rem_op; - switch (rem_op->op) { - case RAMSTER_REMOTIFY_EPH_PUT: -BUG(); - zcache_remote_eph_put((struct zbud_hdr *)rem_op); - break; - case RAMSTER_REMOTIFY_PERS_PUT: - zcache_remote_pers_put((struct zv_hdr *)rem_op); - break; - case RAMSTER_REMOTIFY_FLUSH_PAGE: - zcache_remote_flush_page((struct flushlist_node *)u); - break; - case RAMSTER_REMOTIFY_FLUSH_OBJ: - zcache_remote_flush_object((struct flushlist_node *)u); - break; - default: - BUG(); - } - } -out: - return; -} - -/* - * Communicate interface revision with userspace - */ -#include "cluster/ramster_nodemanager.h" -static unsigned long ramster_interface_revision = R2NM_API_VERSION; - -/* - * For now, just push over a few pages every few seconds to - * ensure that it basically works - */ -static struct workqueue_struct *ramster_remotify_workqueue; -static void ramster_remotify_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(ramster_remotify_worker, - ramster_remotify_process); - -static void ramster_remotify_queue_delayed_work(unsigned long delay) -{ - if (!queue_delayed_work(ramster_remotify_workqueue, - &ramster_remotify_worker, delay)) - pr_err("ramster_remotify: bad workqueue\n"); -} - - -static int use_frontswap; -static int use_cleancache; -static int ramster_remote_target_nodenum = -1; -static void ramster_remotify_process(struct work_struct *work) -{ - static bool remotify_in_progress; - - BUG_ON(irqs_disabled()); - if (remotify_in_progress) - ramster_remotify_queue_delayed_work(HZ); - else if (ramster_remote_target_nodenum != -1) { - remotify_in_progress = true; -#ifdef CONFIG_CLEANCACHE - if (use_cleancache && ramster_eph_remotify_enable) - zbud_remotify_pages(5000); /* FIXME is this a good number? */ -#endif -#ifdef CONFIG_FRONTSWAP - if (use_frontswap && ramster_pers_remotify_enable) - zcache_do_remotify_ops(500); /* FIXME is this a good number? */ -#endif - remotify_in_progress = false; - ramster_remotify_queue_delayed_work(HZ); - } -} - -static void ramster_remotify_init(void) -{ - unsigned long n = 60UL; - ramster_remotify_workqueue = - create_singlethread_workqueue("ramster_remotify"); - ramster_remotify_queue_delayed_work(n * HZ); -} - - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the TLSF-based xvMalloc - * with lzo1x compression to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -/* rudimentary policy limits */ -/* total number of persistent pages may not exceed this percentage */ -static unsigned int zv_page_count_policy_percent = 75; -/* - * byte count defining poor compression; pages with greater zsize will be - * rejected - */ -static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; -/* - * byte count defining poor *mean* compression; pages with greater zsize - * will be rejected until sufficient better-compressed pages are accepted - * driving the mean below this threshold - */ -static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; - -static atomic_t zv_curr_dist_counts[NCHUNKS]; -static atomic_t zv_cumul_dist_counts[NCHUNKS]; - - -static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int alloc_size = clen + sizeof(struct zv_hdr); - int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - int ret; - - BUG_ON(!irqs_disabled()); - BUG_ON(chunks >= NCHUNKS); - ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - atomic_inc(&zv_curr_dist_counts[chunks]); - atomic_inc(&zv_cumul_dist_counts[chunks]); - zv = kmap_atomic(page) + offset; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - SET_SENTINEL(zv, ZVH); - INIT_LIST_HEAD(&zv->rem_op.list); - zv->client_id = get_client_id_from_client(cli); - zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT; - if (zv->client_id == LOCAL_CLIENT) { - spin_lock(&zcache_rem_op_list_lock); - list_add_tail(&zv->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); - } - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - kunmap_atomic(zv); -out: - return zv; -} - -/* similar to zv_create, but just reserve space, no data yet */ -static struct zv_hdr *zv_alloc(struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, - unsigned clen) -{ - struct zcache_client *cli = pool->client; - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int ret; - - BUG_ON(!irqs_disabled()); - BUG_ON(!is_local_client(pool->client)); - ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - zv = kmap_atomic(page) + offset; - SET_SENTINEL(zv, ZVH); - INIT_LIST_HEAD(&zv->rem_op.list); - zv->client_id = LOCAL_CLIENT; - zv->rem_op.op = RAMSTER_INTRANSIT_PERS; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool->pool_id; - kunmap_atomic(zv); -out: - return zv; -} - -static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) -{ - unsigned long flags; - struct page *page; - uint32_t offset; - uint16_t size = xv_get_object_size(zv); - int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - - ASSERT_SENTINEL(zv, ZVH); - BUG_ON(chunks >= NCHUNKS); - atomic_dec(&zv_curr_dist_counts[chunks]); - size -= sizeof(*zv); - spin_lock(&zcache_rem_op_list_lock); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0); - INVERT_SENTINEL(zv, ZVH); - if (!list_empty(&zv->rem_op.list)) - list_del_init(&zv->rem_op.list); - spin_unlock(&zcache_rem_op_list_lock); - page = virt_to_page(zv); - offset = (unsigned long)zv & ~PAGE_MASK; - local_irq_save(flags); - xv_free(xvpool, page, offset); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, struct zv_hdr *zv) -{ - size_t clen = PAGE_SIZE; - char *to_va; - unsigned size; - int ret; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0); - to_va = kmap_atomic(page); - ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), - size, to_va, &clen); - kunmap_atomic(to_va); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); -} - -static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv) -{ - unsigned size; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - BUG_ON(size > *bufsize); - memcpy(data, (char *)zv + sizeof(*zv), size); - *bufsize = size; -} - -static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size) -{ - unsigned zv_size; - - ASSERT_SENTINEL(zv, ZVH); - zv_size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(zv_size != size); - BUG_ON(zv_size == 0 || zv_size > zv_max_page_size); - memcpy((char *)zv + sizeof(*zv), data, size); -} - -#ifdef CONFIG_SYSFS -/* - * show a distribution of compression stats for zv pages. - */ - -static int zv_curr_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_curr_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -static int zv_cumul_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_cumul_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -/* - * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected. We don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_zsize); -} - -static ssize_t zv_max_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_zsize = val; - return count; -} - -/* - * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected UNLESS the mean compression is also smaller - * than this value. In other words, we are load-balancing-by-zsize the - * accepted pages. Again, we don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_mean_zsize); -} - -static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_mean_zsize = val; - return count; -} - -/* - * setting zv_page_count_policy_percent via sysfs sets an upper bound of - * persistent (e.g. swap) pages that will be retained according to: - * (zv_page_count_policy_percent * totalram_pages) / 100) - * when that limit is reached, further puts will be rejected (until - * some pages have been flushed). Note that, due to compression, - * this number may exceed 100; it defaults to 75 and we set an - * arbitary limit of 150. A poor choice will almost certainly result - * in OOM's, so this value should only be changed prudently. - */ -static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_page_count_policy_percent); -} - -static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > 150)) - return -EINVAL; - zv_page_count_policy_percent = val; - return count; -} - -static struct kobj_attribute zcache_zv_max_zsize_attr = { - .attr = { .name = "zv_max_zsize", .mode = 0644 }, - .show = zv_max_zsize_show, - .store = zv_max_zsize_store, -}; - -static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { - .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, - .show = zv_max_mean_zsize_show, - .store = zv_max_mean_zsize_store, -}; - -static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { - .attr = { .name = "zv_page_count_policy_percent", - .mode = 0644 }, - .show = zv_page_count_policy_percent_show, - .store = zv_page_count_policy_percent_store, -}; -#endif - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_nonactive_puts; -static unsigned long zcache_failed_pers_puts; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself): LOCAL_CLIENT. - * RAMster has each client numbered by cluster node, and a KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. - */ -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else { - if (cli_id >= MAX_CLIENTS) - goto out; - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - } - if (poolid < MAX_POOLS_PER_CLIENT) { - pool = cli->tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } -out: - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - struct zcache_client *cli = NULL; - - if (pool == NULL) - BUG(); - cli = pool->client; - atomic_dec(&pool->refcount); - atomic_dec(&cli->refcount); -} - -int zcache_new_client(uint16_t cli_id) -{ - struct zcache_client *cli = NULL; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - if (cli->allocated) - goto out; - cli->allocated = 1; -#ifdef CONFIG_FRONTSWAP - cli->xvpool = xv_create_pool(); - if (cli->xvpool == NULL) - goto out; -#endif - ret = 0; -out: - return ret; -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static struct kmem_cache *ramster_flnode_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; - struct flushlist_node *flnode; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - struct flushlist_node *flnode; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto out; - } - flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK); - if (unlikely(flnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - if (is_ephemeral(pool)) { - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - kmem_cache_free(ramster_flnode_cache, flnode); - goto out; - } - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->flnode == NULL) - kp->flnode = flnode; - else - kmem_cache_free(ramster_flnode_cache, flnode); - if (is_ephemeral(pool)) { - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - } - ret = 0; -out: - return ret; -} - -static int ramster_do_preload_flnode_only(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct flushlist_node *flnode; - int ret = -ENOMEM; - - BUG_ON(!irqs_disabled()); - if (unlikely(ramster_flnode_cache == NULL)) - BUG(); - kp = &__get_cpu_var(zcache_preloads); - flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC); - if (unlikely(flnode == NULL) && kp->flnode == NULL) - BUG(); /* FIXME handle more gracefully, but how??? */ - else if (kp->flnode == NULL) - kp->flnode = flnode; - else - kmem_cache_free(ramster_flnode_cache, flnode); - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool) -{ - struct flushlist_node *flnode = NULL; - struct zcache_preload *kp; - int count; - - kp = &__get_cpu_var(zcache_preloads); - flnode = kp->flnode; - BUG_ON(flnode == NULL); - kp->flnode = NULL; - count = atomic_inc_return(&ramster_curr_flnode_count); - if (count > ramster_curr_flnode_count_max) - ramster_curr_flnode_count_max = count; - return flnode; -} - -static void ramster_flnode_free(struct flushlist_node *flnode, - struct tmem_pool *pool) -{ - atomic_dec(&ramster_curr_flnode_count); - BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0); - kmem_cache_free(ramster_flnode_cache, flnode); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - - -static inline void dec_and_check(atomic_t *pvar) -{ - atomic_dec(pvar); - /* later when all accounting is fixed, make this a BUG */ - WARN_ON_ONCE(atomic_read(pvar) < 0); -} - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, size_t *out_len); - -static int zcache_pampd_eph_create(char *data, size_t size, bool raw, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, void **pampd) -{ - int ret = -1; - void *cdata = data; - size_t clen = size; - struct zcache_client *cli = pool->client; - uint16_t client_id = get_client_id_from_client(cli); - struct page *page = NULL; - unsigned long count; - - if (!raw) { - page = virt_to_page(data); - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - } - *pampd = (void *)zbud_create(client_id, pool->pool_id, oid, - index, page, cdata, clen); - if (*pampd == NULL) { - ret = -ENOMEM; - goto out; - } - ret = 0; - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - if (client_id != LOCAL_CLIENT) { - count = atomic_inc_return(&ramster_foreign_eph_pampd_count); - if (count > ramster_foreign_eph_pampd_count_max) - ramster_foreign_eph_pampd_count_max = count; - } -out: - return ret; -} - -static int zcache_pampd_pers_create(char *data, size_t size, bool raw, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, void **pampd) -{ - int ret = -1; - void *cdata = data; - size_t clen = size; - struct zcache_client *cli = pool->client; - struct page *page; - unsigned long count; - unsigned long zv_mean_zsize; - struct zv_hdr *zv; - long curr_pers_pampd_count; - u64 total_zsize; -#ifdef RAMSTER_TESTING - static bool pampd_neg_warned; -#endif - - curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) - - atomic_read(&ramster_remote_pers_pages); -#ifdef RAMSTER_TESTING - /* should always be positive, but warn if accounting is off */ - if (!pampd_neg_warned) { - pr_warn("ramster: bad accounting for curr_pers_pampd_count\n"); - pampd_neg_warned = true; - } -#endif - if (curr_pers_pampd_count > - (zv_page_count_policy_percent * totalram_pages) / 100) { - zcache_policy_percent_exceeded++; - goto out; - } - if (raw) - goto ok_to_create; - page = virt_to_page(data); - if (zcache_compress(page, &cdata, &clen) == 0) - goto out; - /* reject if compression is too poor */ - if (clen > zv_max_zsize) { - zcache_compress_poor++; - goto out; - } - /* reject if mean compression is too poor */ - if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { - total_zsize = xv_get_total_size_bytes(cli->xvpool); - zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count); - if (zv_mean_zsize > zv_max_mean_zsize) { - zcache_mean_compress_poor++; - goto out; - } - } -ok_to_create: - *pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen); - if (*pampd == NULL) { - ret = -ENOMEM; - goto out; - } - ret = 0; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - if (is_local_client(cli)) - goto out; - zv = *(struct zv_hdr **)pampd; - count = atomic_inc_return(&ramster_foreign_pers_pampd_count); - if (count > ramster_foreign_pers_pampd_count_max) - ramster_foreign_pers_pampd_count_max = count; -out: - return ret; -} - -static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index) -{ - void *pampd = NULL; - int ret; - bool ephemeral; - - BUG_ON(preemptible()); - ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool)); - if (ephemeral) - ret = zcache_pampd_eph_create(data, size, raw, pool, - oid, index, &pampd); - else - ret = zcache_pampd_pers_create(data, size, raw, pool, - oid, index, &pampd); - /* FIXME add some counters here for failed creates? */ - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - - BUG_ON(preemptible()); - BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */ - BUG_ON(pampd_is_remote(pampd)); - if (raw) - zv_copy_from_pampd(data, bufsize, pampd); - else - zv_decompress(virt_to_page(data), pampd); - return ret; -} - -static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - unsigned long flags; - struct zcache_client *cli = pool->client; - - BUG_ON(preemptible()); - BUG_ON(pampd_is_remote(pampd)); - if (is_ephemeral(pool)) { - local_irq_save(flags); - if (raw) - zbud_copy_from_pampd(data, bufsize, pampd); - else - ret = zbud_decompress(virt_to_page(data), pampd); - zbud_free_and_delist((struct zbud_hdr *)pampd); - local_irq_restore(flags); - if (!is_local_client(cli)) - dec_and_check(&ramster_foreign_eph_pampd_count); - dec_and_check(&zcache_curr_eph_pampd_count); - } else { - if (is_local_client(cli)) - BUG(); - if (raw) - zv_copy_from_pampd(data, bufsize, pampd); - else - zv_decompress(virt_to_page(data), pampd); - zv_free(cli->xvpool, pampd); - if (!is_local_client(cli)) - dec_and_check(&ramster_foreign_pers_pampd_count); - dec_and_check(&zcache_curr_pers_pampd_count); - ret = 0; - } - return ret; -} - -static bool zcache_pampd_is_remote(void *pampd) -{ - return pampd_is_remote(pampd); -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! - */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, bool acct) -{ - struct zcache_client *cli = pool->client; - bool eph = is_ephemeral(pool); - struct zv_hdr *zv; - - BUG_ON(preemptible()); - if (pampd_is_remote(pampd)) { - WARN_ON(acct == false); - if (oid == NULL) { - /* - * a NULL oid means to ignore this pampd free - * as the remote freeing will be handled elsewhere - */ - } else if (eph) { - /* FIXME remote flush optional but probably good idea */ - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_eph_pampd_count); - } else if (pampd_is_intransit(pampd)) { - /* did a pers remote get_and_free, so just free local */ - pampd = pampd_mask_intransit_and_remote(pampd); - goto local_pers; - } else { - struct flushlist_node *flnode = - ramster_flnode_alloc(pool); - - flnode->xh.client_id = pampd_remote_node(pampd); - flnode->xh.pool_id = pool->pool_id; - flnode->xh.oid = *oid; - flnode->xh.index = index; - flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE; - spin_lock(&zcache_rem_op_list_lock); - list_add(&flnode->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); - dec_and_check(&zcache_curr_pers_pampd_count); - dec_and_check(&ramster_remote_pers_pages); - } - } else if (eph) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - if (!is_local_client(pool->client)) - dec_and_check(&ramster_foreign_eph_pampd_count); - if (acct) - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_eph_pampd_count); - } else { -local_pers: - zv = (struct zv_hdr *)pampd; - if (!is_local_client(pool->client)) - dec_and_check(&ramster_foreign_pers_pampd_count); - zv_free(cli->xvpool, zv); - if (acct) - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_pers_pampd_count); - } -} - -static void zcache_pampd_free_obj(struct tmem_pool *pool, - struct tmem_obj *obj) -{ - struct flushlist_node *flnode; - - BUG_ON(preemptible()); - if (obj->extra == NULL) - return; - BUG_ON(!pampd_is_remote(obj->extra)); - flnode = ramster_flnode_alloc(pool); - flnode->xh.client_id = pampd_remote_node(obj->extra); - flnode->xh.pool_id = pool->pool_id; - flnode->xh.oid = obj->oid; - flnode->xh.index = FLUSH_ENTIRE_OBJECT; - flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ; - spin_lock(&zcache_rem_op_list_lock); - list_add(&flnode->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); -} - -void zcache_pampd_new_obj(struct tmem_obj *obj) -{ - obj->extra = NULL; -} - -int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj) -{ - int ret = -1; - - if (new_pampd != NULL) { - if (obj->extra == NULL) - obj->extra = new_pampd; - /* enforce that all remote pages in an object reside - * in the same node! */ - else if (pampd_remote_node(new_pampd) != - pampd_remote_node((void *)(obj->extra))) - BUG(); - ret = 0; - } - return ret; -} - -/* - * Called by the message handler after a (still compressed) page has been - * fetched from the remote machine in response to an "is_remote" tmem_get - * or persistent tmem_localify. For a tmem_get, "extra" is the address of - * the page that is to be filled to succesfully resolve the tmem_get; for - * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only - * in the local zcache). "data" points to "size" bytes of (compressed) data - * passed in the message. In the case of a persistent remote get, if - * pre-allocation was successful (see zcache_repatriate_preload), the page - * is placed into both local zcache and at "extra". - */ -int zcache_localify(int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t size, - void *extra) -{ - int ret = -ENOENT; - unsigned long flags; - struct tmem_pool *pool; - bool ephemeral, delete = false; - size_t clen = PAGE_SIZE; - void *pampd, *saved_hb; - struct tmem_obj *obj; - - pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id); - if (unlikely(pool == NULL)) - /* pool doesn't exist anymore */ - goto out; - ephemeral = is_ephemeral(pool); - local_irq_save(flags); /* FIXME: maybe only disable softirqs? */ - pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb); - if (pampd == NULL) { - /* hmmm... must have been a flush while waiting */ -#ifdef RAMSTER_TESTING - pr_err("UNTESTED pampd==NULL in zcache_localify\n"); -#endif - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - ramster_remote_pers_pages_unsucc_get++; - obj = NULL; - goto finish; - } else if (unlikely(!pampd_is_remote(pampd))) { - /* hmmm... must have been a dup put while waiting */ -#ifdef RAMSTER_TESTING - pr_err("UNTESTED dup while waiting in zcache_localify\n"); -#endif - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - ramster_remote_pers_pages_unsucc_get++; - obj = NULL; - pampd = NULL; - ret = -EEXIST; - goto finish; - } else if (size == 0) { - /* no remote data, delete the local is_remote pampd */ - pampd = NULL; - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - BUG(); - delete = true; - goto finish; - } - if (!ephemeral && pampd_is_intransit(pampd)) { - /* localify to zcache */ - pampd = pampd_mask_intransit_and_remote(pampd); - zv_copy_to_pampd(pampd, data, size); - } else { - pampd = NULL; - obj = NULL; - } - if (extra != NULL) { - /* decompress direct-to-memory to complete remotify */ - ret = lzo1x_decompress_safe((char *)data, size, - (char *)extra, &clen); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); - } - if (ephemeral) - ramster_remote_eph_pages_succ_get++; - else - ramster_remote_pers_pages_succ_get++; - ret = 0; -finish: - tmem_localify_finish(obj, index, pampd, saved_hb, delete); - zcache_put_pool(pool); - local_irq_restore(flags); -out: - return ret; -} - -/* - * Called on a remote persistent tmem_get to attempt to preallocate - * local storage for the data contained in the remote persistent page. - * If succesfully preallocated, returns the pampd, marked as remote and - * in_transit. Else returns NULL. Note that the appropriate tmem data - * structure must be locked. - */ -static void *zcache_pampd_repatriate_preload(void *pampd, - struct tmem_pool *pool, - struct tmem_oid *oid, - uint32_t index, - bool *intransit) -{ - int clen = pampd_remote_size(pampd); - void *ret_pampd = NULL; - unsigned long flags; - - if (!pampd_is_remote(pampd)) - BUG(); - if (is_ephemeral(pool)) - BUG(); - if (pampd_is_intransit(pampd)) { - /* - * to avoid multiple allocations (and maybe a memory leak) - * don't preallocate if already in the process of being - * repatriated - */ - *intransit = true; - goto out; - } - *intransit = false; - local_irq_save(flags); - ret_pampd = (void *)zv_alloc(pool, oid, index, clen); - if (ret_pampd != NULL) { - /* - * a pampd is marked intransit if it is remote and space has - * been allocated for it locally (note, only happens for - * persistent pages, in which case the remote copy is freed) - */ - ret_pampd = pampd_mark_intransit(ret_pampd); - dec_and_check(&ramster_remote_pers_pages); - } else - ramster_pers_pages_remote_nomem++; - local_irq_restore(flags); -out: - return ret_pampd; -} - -/* - * Called on a remote tmem_get to invoke a message to fetch the page. - * Might sleep so no tmem locks can be held. "extra" is passed - * all the way through the round-trip messaging to zcache_localify. - */ -static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd, - struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, - bool free, void *extra) -{ - struct tmem_xhandle xh; - int ret; - - if (pampd_is_intransit(real_pampd)) - /* have local space pre-reserved, so free remote copy */ - free = true; - xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index); - /* unreliable request/response for now */ - ret = ramster_remote_async_get(&xh, free, - pampd_remote_node(fake_pampd), - pampd_remote_size(fake_pampd), - pampd_remote_cksum(fake_pampd), - extra); -#ifdef RAMSTER_TESTING - if (ret != 0 && ret != -ENOENT) - pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n", - ret); -#endif - return ret; -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .free = zcache_pampd_free, - .get_data_and_free = zcache_pampd_get_data_and_free, - .free_obj = zcache_pampd_free_obj, - .is_remote = zcache_pampd_is_remote, - .repatriate_preload = zcache_pampd_repatriate_preload, - .repatriate = zcache_pampd_repatriate, - .new_obj = zcache_pampd_new_obj, - .replace_in_obj = zcache_pampd_replace_in_obj, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS -#define LZO_DSTMEM_PAGE_ORDER 1 -static DEFINE_PER_CPU(unsigned char *, zcache_workmem); -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); - -static int zcache_compress(struct page *from, void **out_va, size_t *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - unsigned char *wmem = __get_cpu_var(zcache_workmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL || wmem == NULL)) - goto out; /* no buffer, so can't compress */ - from_va = kmap_atomic(from); - mb(); - ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); - BUG_ON(ret != LZO_E_OK); - *out_va = dmem; - kunmap_atomic(from_va); - ret = 1; -out: - return ret; -} - - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - GFP_KERNEL | __GFP_REPEAT, - LZO_DSTMEM_PAGE_ORDER), - per_cpu(zcache_workmem, cpu) = - kzalloc(LZO1X_MEM_COMPRESS, - GFP_KERNEL | __GFP_REPEAT); - per_cpu(zcache_remoteputmem, cpu) = - kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - kfree(per_cpu(zcache_remoteputmem, cpu)); - per_cpu(zcache_remoteputmem, cpu) = NULL; - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - LZO_DSTMEM_PAGE_ORDER); - per_cpu(zcache_dstmem, cpu) = NULL; - kfree(per_cpu(zcache_workmem, cpu)); - per_cpu(zcache_workmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - if (kp->obj) { - kmem_cache_free(zcache_obj_cache, kp->obj); - kp->obj = NULL; - } - if (kp->flnode) { - kmem_cache_free(ramster_flnode_cache, kp->flnode); - kp->flnode = NULL; - } - if (kp->page) { - free_page((unsigned long)kp->page); - kp->page = NULL; - } - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(nonactive_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO(mean_compress_poor); -ZCACHE_SYSFS_RO(policy_percent_exceeded); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); -ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, - zv_curr_dist_counts_show); -ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, - zv_cumul_dist_counts_show); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_nonactive_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_policy_percent_exceeded_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_mean_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - &zcache_zv_curr_dist_counts_attr.attr, - &zcache_zv_cumul_dist_counts_attr.attr, - &zcache_zv_max_zsize_attr.attr, - &zcache_zv_max_mean_zsize_attr.attr, - &zcache_zv_page_count_policy_percent_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; - -#define RAMSTER_SYSFS_RO(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", ramster_##_name); \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = ramster_##_name##_show, \ - } - -#define RAMSTER_SYSFS_RW(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", ramster_##_name); \ - } \ - static ssize_t ramster_##_name##_store(struct kobject *kobj, \ - struct kobj_attribute *attr, const char *buf, size_t count) \ - { \ - int err; \ - unsigned long enable; \ - err = kstrtoul(buf, 10, &enable); \ - if (err) \ - return -EINVAL; \ - ramster_##_name = enable; \ - return count; \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0644 }, \ - .show = ramster_##_name##_show, \ - .store = ramster_##_name##_store, \ - } - -#define RAMSTER_SYSFS_RO_ATOMIC(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = ramster_##_name##_show, \ - } - -RAMSTER_SYSFS_RO(interface_revision); -RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages); -RAMSTER_SYSFS_RW(pers_remotify_enable); -RAMSTER_SYSFS_RW(eph_remotify_enable); -RAMSTER_SYSFS_RO(eph_pages_remoted); -RAMSTER_SYSFS_RO(eph_pages_remote_failed); -RAMSTER_SYSFS_RO(pers_pages_remoted); -RAMSTER_SYSFS_RO(pers_pages_remote_failed); -RAMSTER_SYSFS_RO(pers_pages_remote_nomem); -RAMSTER_SYSFS_RO(remote_pages_flushed); -RAMSTER_SYSFS_RO(remote_page_flushes_failed); -RAMSTER_SYSFS_RO(remote_objects_flushed); -RAMSTER_SYSFS_RO(remote_object_flushes_failed); -RAMSTER_SYSFS_RO(remote_eph_pages_succ_get); -RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get); -RAMSTER_SYSFS_RO(remote_pers_pages_succ_get); -RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get); -RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count); -RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max); -RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count); -RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max); -RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count); -RAMSTER_SYSFS_RO(curr_flnode_count_max); - -#define MANUAL_NODES 8 -static bool ramster_nodes_manual_up[MANUAL_NODES]; -static ssize_t ramster_manual_node_up_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - int i; - char *p = buf; - for (i = 0; i < MANUAL_NODES; i++) - if (ramster_nodes_manual_up[i]) - p += sprintf(p, "%d ", i); - p += sprintf(p, "\n"); - return p - buf; -} - -static ssize_t ramster_manual_node_up_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long node_num; - - err = kstrtoul(buf, 10, &node_num); - if (err) { - pr_err("ramster: bad strtoul?\n"); - return -EINVAL; - } - if (node_num >= MANUAL_NODES) { - pr_err("ramster: bad node_num=%lu?\n", node_num); - return -EINVAL; - } - if (ramster_nodes_manual_up[node_num]) { - pr_err("ramster: node %d already up, ignoring\n", - (int)node_num); - } else { - ramster_nodes_manual_up[node_num] = true; - r2net_hb_node_up_manual((int)node_num); - } - return count; -} - -static struct kobj_attribute ramster_manual_node_up_attr = { - .attr = { .name = "manual_node_up", .mode = 0644 }, - .show = ramster_manual_node_up_show, - .store = ramster_manual_node_up_store, -}; - -static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - if (ramster_remote_target_nodenum == -1UL) - return sprintf(buf, "unset\n"); - else - return sprintf(buf, "%d\n", ramster_remote_target_nodenum); -} - -static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long node_num; - - err = kstrtoul(buf, 10, &node_num); - if (err) { - pr_err("ramster: bad strtoul?\n"); - return -EINVAL; - } else if (node_num == -1UL) { - pr_err("ramster: disabling all remotification, " - "data may still reside on remote nodes however\n"); - return -EINVAL; - } else if (node_num >= MANUAL_NODES) { - pr_err("ramster: bad node_num=%lu?\n", node_num); - return -EINVAL; - } else if (!ramster_nodes_manual_up[node_num]) { - pr_err("ramster: node %d not up, ignoring setting " - "of remotification target\n", (int)node_num); - } else if (r2net_remote_target_node_set((int)node_num) >= 0) { - pr_info("ramster: node %d set as remotification target\n", - (int)node_num); - ramster_remote_target_nodenum = (int)node_num; - } else { - pr_err("ramster: bad num to node node_num=%d?\n", - (int)node_num); - return -EINVAL; - } - return count; -} - -static struct kobj_attribute ramster_remote_target_nodenum_attr = { - .attr = { .name = "remote_target_nodenum", .mode = 0644 }, - .show = ramster_remote_target_nodenum_show, - .store = ramster_remote_target_nodenum_store, -}; - - -static struct attribute *ramster_attrs[] = { - &ramster_interface_revision_attr.attr, - &ramster_pers_remotify_enable_attr.attr, - &ramster_eph_remotify_enable_attr.attr, - &ramster_remote_pers_pages_attr.attr, - &ramster_eph_pages_remoted_attr.attr, - &ramster_eph_pages_remote_failed_attr.attr, - &ramster_pers_pages_remoted_attr.attr, - &ramster_pers_pages_remote_failed_attr.attr, - &ramster_pers_pages_remote_nomem_attr.attr, - &ramster_remote_pages_flushed_attr.attr, - &ramster_remote_page_flushes_failed_attr.attr, - &ramster_remote_objects_flushed_attr.attr, - &ramster_remote_object_flushes_failed_attr.attr, - &ramster_remote_eph_pages_succ_get_attr.attr, - &ramster_remote_eph_pages_unsucc_get_attr.attr, - &ramster_remote_pers_pages_succ_get_attr.attr, - &ramster_remote_pers_pages_unsucc_get_attr.attr, - &ramster_foreign_eph_pampd_count_attr.attr, - &ramster_foreign_eph_pampd_count_max_attr.attr, - &ramster_foreign_pers_pampd_count_attr.attr, - &ramster_foreign_pers_pampd_count_max_attr.attr, - &ramster_curr_flnode_count_attr.attr, - &ramster_curr_flnode_count_max_attr.attr, - &ramster_manual_node_up_attr.attr, - &ramster_remote_target_nodenum_attr.attr, - NULL, -}; - -static struct attribute_group ramster_attr_group = { - .attrs = ramster_attrs, - .name = "ramster", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int ret = -1; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - zbud_evict_pages(nr); - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t size, - bool raw, int ephemeral) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t *sizep, - bool raw, int get_and_free) -{ - struct tmem_pool *pool; - int ret = -1; - bool eph; - - if (!raw) { - BUG_ON(irqs_disabled()); - BUG_ON(in_softirq()); - } - pool = zcache_get_pool_by_id(cli_id, pool_id); - eph = is_ephemeral(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, data, sizep, - raw, get_and_free); - zcache_put_pool(pool); - } - WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, " - "bad things are very likely to happen soon\n"); -#ifdef RAMSTER_TESTING - if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool))) - pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret); -#endif - if (ret == -EAGAIN) - BUG(); /* FIXME... don't need this anymore??? let's ensure */ - return ret; -} - -int zcache_flush(int cli_id, int pool_id, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - ramster_do_preload_flnode_only(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - ramster_do_preload_flnode_only(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - local_irq_restore(flags); - return ret; -} - -int zcache_client_destroy_pool(int cli_id, int pool_id) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool == NULL) - goto out; - cli->tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - atomic_dec(&cli->refcount); - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id); -out: - return ret; -} - -static int zcache_destroy_pool(int pool_id) -{ - return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id); -} - -int zcache_new_pool(uint16_t cli_id, uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); - if (pool == NULL) { - pr_info("ramster: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (cli->tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("ramster: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - cli->tmem_pools[poolid] = pool; - if (cli_id == LOCAL_CLIENT) - pr_info("ramster: created %s tmem pool, id=%d, local client\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid); - else - pr_info("ramster: created %s tmem pool, id=%d, client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid, cli_id); -out: - if (cli != NULL) - atomic_dec(&cli->refcount); - return poolid; -} - -static int zcache_local_new_pool(uint32_t flags) -{ - return zcache_new_pool(LOCAL_CLIENT, flags); -} - -int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral) -{ - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - goto out; - if (pool_id >= MAX_POOLS_PER_CLIENT) - goto out; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap)) - BUG(); /* FIXME, handle more gracefully later */ - if (!cli->allocated) { - if (zcache_new_client(cli_id)) - BUG(); /* FIXME, handle more gracefully later */ - cli = &zcache_clients[cli_id]; - } - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool != NULL) { - if (pool->persistent && ephemeral) { - pr_err("zcache_autocreate_pool: type mismatch\n"); - goto out; - } - ret = 0; - goto out; - } - pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); - if (pool == NULL) { - pr_info("ramster: pool creation failed: out of memory\n"); - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = pool_id; - tmem_new_pool(pool, flags); - cli->tmem_pools[pool_id] = pool; - pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - pool_id, cli_id); - ret = 0; -out: - if (cli == NULL) - BUG(); /* FIXME, handle more gracefully later */ - /* pr_err("zcache_autocreate_pool: failed\n"); */ - if (cli != NULL) - atomic_dec(&cli->refcount); - return ret; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. - */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - -#ifdef __PG_WAS_ACTIVE - if (!PageWasActive(page)) { - zcache_nonactive_puts++; - return; - } -#endif - if (likely(ind == index)) { - char *kva = page_address(page); - - (void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index, - kva, PAGE_SIZE, 0, 1); - } -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - preempt_disable(); - if (likely(ind == index)) { - char *kva = page_address(page); - size_t size = PAGE_SIZE; - - ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index, - kva, &size, 0, 0); -#ifdef __PG_WAS_ACTIVE - if (ret == 0) - SetPageWasActive(page); -#endif - } - preempt_enable(); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_local_new_pool(0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_local_new_pool(0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .invalidate_page = zcache_cleancache_flush_page, - .invalidate_inode = zcache_cleancache_flush_inode, - .invalidate_fs = zcache_cleancache_flush_fs, - .init_shared_fs = zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 8 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - char *kva; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - kva = page_address(page); - ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), kva, PAGE_SIZE, 0, 0); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - preempt_disable(); /* FIXME, remove this? */ - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - char *kva = page_address(page); - size_t size = PAGE_SIZE; - - ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), kva, &size, 0, -1); - } - preempt_enable(); /* FIXME, remove this? */ - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(LOCAL_CLIENT, - zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = - zcache_local_new_pool(TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .invalidate_page = zcache_frontswap_flush_page, - .invalidate_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * frontswap selfshrinking - */ - -#ifdef CONFIG_FRONTSWAP -/* In HZ, controls frequency of worker invocation. */ -static unsigned int selfshrink_interval __read_mostly = 5; - -static void selfshrink_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process); - -/* Enable/disable with sysfs. */ -static bool frontswap_selfshrinking __read_mostly; - -/* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; - -/* - * The default values for the following parameters were deemed reasonable - * by experimentation, may be workload-dependent, and can all be - * adjusted via sysfs. - */ - -/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ -static unsigned int frontswap_hysteresis __read_mostly = 20; - -/* - * Number of selfshrink worker invocations to wait before observing that - * frontswap selfshrinking should commence. Note that selfshrinking does - * not use a separate worker thread. - */ -static unsigned int frontswap_inertia __read_mostly = 3; - -/* Countdown to next invocation of frontswap_shrink() */ -static unsigned long frontswap_inertia_counter; - -/* - * Invoked by the selfshrink worker thread, uses current number of pages - * in frontswap (frontswap_curr_pages()), previous status, and control - * values (hysteresis and inertia) to determine if frontswap should be - * shrunk and what the new frontswap size should be. Note that - * frontswap_shrink is essentially a partial swapoff that immediately - * transfers pages from the "swap device" (frontswap) back into kernel - * RAM; despite the name, frontswap "shrinking" is very different from - * the "shrinker" interface used by the kernel MM subsystem to reclaim - * memory. - */ -static void frontswap_selfshrink(void) -{ - static unsigned long cur_frontswap_pages; - static unsigned long last_frontswap_pages; - static unsigned long tgt_frontswap_pages; - - last_frontswap_pages = cur_frontswap_pages; - cur_frontswap_pages = frontswap_curr_pages(); - if (!cur_frontswap_pages || - (cur_frontswap_pages > last_frontswap_pages)) { - frontswap_inertia_counter = frontswap_inertia; - return; - } - if (frontswap_inertia_counter && --frontswap_inertia_counter) - return; - if (cur_frontswap_pages <= frontswap_hysteresis) - tgt_frontswap_pages = 0; - else - tgt_frontswap_pages = cur_frontswap_pages - - (cur_frontswap_pages / frontswap_hysteresis); - frontswap_shrink(tgt_frontswap_pages); -} - -static int __init ramster_nofrontswap_selfshrink_setup(char *s) -{ - use_frontswap_selfshrink = false; - return 1; -} - -__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup); - -static void selfshrink_process(struct work_struct *work) -{ - if (frontswap_selfshrinking && frontswap_enabled) { - frontswap_selfshrink(); - schedule_delayed_work(&selfshrink_worker, - selfshrink_interval * HZ); - } -} - -static int ramster_enabled; - -static int __init ramster_selfshrink_init(void) -{ - frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink; - if (frontswap_selfshrinking) - pr_info("ramster: Initializing frontswap " - "selfshrinking driver.\n"); - else - return -ENODEV; - - schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ); - - return 0; -} - -subsys_initcall(ramster_selfshrink_init); -#endif - -/* - * zcache initialization - * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! - */ - -static int ramster_enabled; - -static int __init enable_ramster(char *s) -{ - ramster_enabled = 1; - return 1; -} -__setup("ramster", enable_ramster); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - pr_info("INIT no_cleancache called\n"); - use_cleancache = 0; - return 1; -} - -/* - * FIXME: need to guarantee this gets checked before zcache_init is called - * What is the correct way to achieve this? - */ -early_param("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - pr_info("INIT no_frontswap called\n"); - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init zcache_init(void) -{ - int ret = 0; - -#ifdef CONFIG_SYSFS - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - ret = sysfs_create_group(mm_kobj, &ramster_attr_group); - if (ret) { - pr_err("ramster: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (ramster_enabled) { - unsigned int cpu; - - (void)r2net_register_handlers(); - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("ramster: can't register cpu notifier\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); - ramster_flnode_cache = kmem_cache_create("ramster_flnode", - sizeof(struct flushlist_node), 0, 0, NULL); -#endif -#ifdef CONFIG_CLEANCACHE - pr_info("INIT ramster_enabled=%d use_cleancache=%d\n", - ramster_enabled, use_cleancache); - if (ramster_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("ramster: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("ramster: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - pr_info("INIT ramster_enabled=%d use_frontswap=%d\n", - ramster_enabled, use_frontswap); - if (ramster_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - zcache_new_client(LOCAL_CLIENT); - old_ops = zcache_frontswap_register_ops(); - pr_info("ramster: frontswap enabled using kernel " - "transcendent memory and xvmalloc\n"); - if (old_ops.init != NULL) - pr_warning("ramster: frontswap_ops overridden"); - } - if (ramster_enabled && (use_frontswap || use_cleancache)) - ramster_remotify_init(); -#endif -out: - return ret; -} - -module_init(zcache_init) diff --git a/drivers/staging/ramster/zcache.h b/drivers/staging/ramster/zcache.h deleted file mode 100644 index 250b121c2..000000000 --- a/drivers/staging/ramster/zcache.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * zcache.h - * - * External zcache functions - * - * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _ZCACHE_H_ -#define _ZCACHE_H_ - -extern int zcache_put(int, int, struct tmem_oid *, uint32_t, - char *, size_t, bool, int); -extern int zcache_autocreate_pool(int, int, bool); -extern int zcache_get(int, int, struct tmem_oid *, uint32_t, - char *, size_t *, bool, int); -extern int zcache_flush(int, int, struct tmem_oid *, uint32_t); -extern int zcache_flush_object(int, int, struct tmem_oid *); -extern int zcache_localify(int, struct tmem_oid *, uint32_t, - char *, size_t, void *); - -#endif /* _ZCACHE_H */ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig deleted file mode 100644 index 7048e01f0..000000000 --- a/drivers/staging/zcache/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config ZCACHE - bool "Dynamic compression of swap pages and clean pagecache pages" - # X86 dependency is because zsmalloc uses non-portable pte/tlb - # functions - depends on (CLEANCACHE || FRONTSWAP) && CRYPTO=y && X86 - select ZSMALLOC - select CRYPTO_LZO - default n - help - Zcache doubles RAM efficiency while providing a significant - performance boosts on many workloads. Zcache uses - compression and an in-kernel implementation of transcendent - memory to store clean page cache pages and swap in RAM, - providing a noticeable reduction in disk I/O. diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile deleted file mode 100644 index 60daa272c..000000000 --- a/drivers/staging/zcache/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zcache-y := zcache-main.o tmem.o - -obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c deleted file mode 100644 index 1ca66ea9b..000000000 --- a/drivers/staging/zcache/tmem.c +++ /dev/null @@ -1,770 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). - */ - -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. - */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - (*tmem_pamops.new_obj)(obj); - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! */ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. - * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not found) - */ -static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? (void **)slot : NULL; -} - -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode **slot; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - return slot != NULL ? *slot : NULL; -} - -static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, - void *new_pampd) -{ - struct tmem_objnode **slot; - void *ret = NULL; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - if ((slot != NULL) && (*slot != NULL)) { - void *old_pampd = *(void **)slot; - *(void **)slot = new_pampd; - (*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0); - ret = new_pampd; - } - return ret; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. */ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool, NULL, 0); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; - (*tmem_pamops.free_obj)(obj->pool, obj); -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. - */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t size, bool raw, bool ephemeral) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool, oidp, index); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(data, size, raw, ephemeral, - obj->pool, &obj->oid, index); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool, NULL, 0); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). - - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t *size, bool raw, int get_and_free) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - int ret = -1; - struct tmem_hashbucket *hb; - bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); - bool lock_held = false; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - lock_held = true; - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - if (free) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - if (free) { - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - if (tmem_pamops.is_remote(pampd)) { - lock_held = false; - spin_unlock(&hb->lock); - } - if (free) - ret = (*tmem_pamops.get_data_and_free)( - data, size, raw, pampd, pool, oidp, index); - else - ret = (*tmem_pamops.get_data)( - data, size, raw, pampd, pool, oidp, index); - if (ret < 0) - goto out; - ret = 0; -out: - if (lock_held) - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). - */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool, oidp, index); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, replace the page so that any - * subsequent "get" gets the new page. Returns 0 if - * there was a page to replace, else returns -1. - */ -int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, void *new_pampd) -{ - struct tmem_obj *obj; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd); - ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. - */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. - */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h deleted file mode 100644 index 0d4aa8270..000000000 --- a/drivers/staging/zcache/tmem.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. - */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) lockdep_assert_held(_l) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). - */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. - */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - void *extra; /* for private use by pampd implementation */ - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(char *, size_t, bool, int, - struct tmem_pool *, struct tmem_oid *, uint32_t); - int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t); - int (*get_data_and_free)(char *, size_t *, bool, void *, - struct tmem_pool *, struct tmem_oid *, - uint32_t); - void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t); - void (*free_obj)(struct tmem_pool *, struct tmem_obj *); - bool (*is_remote)(void *); - void (*new_obj)(struct tmem_obj *); - int (*replace_in_obj)(void *, struct tmem_obj *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t, bool, bool); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t *, bool, int); -extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, - void *); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* _TMEM_H */ diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c deleted file mode 100644 index 1812bedab..000000000 --- a/drivers/staging/zcache/zcache-main.c +++ /dev/null @@ -1,2083 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing the crypto compression - * API: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) zsmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" - -#include "../zsmalloc/zsmalloc.h" - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -#if 0 -/* this is more aggressive but may cause other problems? */ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -#define MAX_POOLS_PER_CLIENT 16 - -#define MAX_CLIENTS 16 -#define LOCAL_CLIENT ((uint16_t)-1) - -MODULE_LICENSE("GPL"); - -struct zcache_client { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct zs_pool *zspool; - bool allocated; - atomic_t refcount; -}; - -static struct zcache_client zcache_host; -static struct zcache_client zcache_clients[MAX_CLIENTS]; - -static inline uint16_t get_client_id_from_client(struct zcache_client *cli) -{ - BUG_ON(cli == NULL); - if (cli == &zcache_host) - return LOCAL_CLIENT; - return cli - &zcache_clients[0]; -} - -static inline bool is_local_client(struct zcache_client *cli) -{ - return cli == &zcache_host; -} - -/* crypto API for zcache */ -#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME -static char zcache_comp_name[ZCACHE_COMP_NAME_SZ]; -static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; - -enum comp_op { - ZCACHE_COMPOP_COMPRESS, - ZCACHE_COMPOP_DECOMPRESS -}; - -static inline int zcache_comp_op(enum comp_op op, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen) -{ - struct crypto_comp *tfm; - int ret; - - BUG_ON(!zcache_comp_pcpu_tfms); - tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); - BUG_ON(!tfm); - switch (op) { - case ZCACHE_COMPOP_COMPRESS: - ret = crypto_comp_compress(tfm, src, slen, dst, dlen); - break; - case ZCACHE_COMPOP_DECOMPRESS: - ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); - break; - } - put_cpu(); - return ret; -} - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. - */ - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static LIST_HEAD(zbpg_unused_list); -static unsigned long zcache_zbpg_unused_list_count; - -/* protects the unused page list */ -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; -static unsigned long zcache_mean_compress_poor; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - bool recycled = 0; - - /* if any pages on the zbpg list, use one */ - spin_lock(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - recycled = 1; - } - spin_unlock(&zbpg_unused_list_spinlock); - if (zbpg == NULL) - /* none on zbpg list, try to get a kernel page */ - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - if (recycled) { - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); - SET_SENTINEL(zbpg, ZBPG); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - } else { - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - spin_lock(&zbpg_unused_list_spinlock); - list_add(&zbpg->bud_list, &zbpg_unused_list); - zcache_zbpg_unused_list_count++; - spin_unlock(&zbpg_unused_list_spinlock); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, - struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - zh->client_id = client_id; - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - unsigned int out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page); - size = zh->size; - from_va = zbud_data(zh, size); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size, - to_va, &out_len); - BUG_ON(ret); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. - */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, - uint16_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; - uint32_t index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(!list_empty(&zbpg->bud_list)); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - client_id[j] = zh->client_id; - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - zbud_free(zh); - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); - if (pool != NULL) { - tmem_flush_page(pool, &oid[i], index[i]); - zcache_put_pool(pool); - } - } - ASSERT_SENTINEL(zbpg, ZBPG); - spin_lock(&zbpg->lock); - zbud_free_raw_page(zbpg); -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. - */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i; - - /* first try freeing any pages on unused list */ -retry_unused_list: - spin_lock_bh(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - /* can't walk list here, since it may change when unlocked */ - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - atomic_dec(&zcache_zbud_curr_raw_pages); - spin_unlock_bh(&zbpg_unused_list_spinlock); - zcache_free_page(zbpg); - zcache_evicted_raw_pages++; - if (--nr <= 0) - goto out; - goto retry_unused_list; - } - spin_unlock_bh(&zbpg_unused_list_spinlock); - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto out; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); -out: - return; -} - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the slab-based zsmalloc - * with the crypto compression API to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -#define ZVH_SENTINEL 0x43214321 - -struct zv_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - size_t size; - DECL_SENTINEL -}; - -/* rudimentary policy limits */ -/* total number of persistent pages may not exceed this percentage */ -static unsigned int zv_page_count_policy_percent = 75; -/* - * byte count defining poor compression; pages with greater zsize will be - * rejected - */ -static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; -/* - * byte count defining poor *mean* compression; pages with greater zsize - * will be rejected until sufficient better-compressed pages are accepted - * driving the mean below this threshold - */ -static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; - -static atomic_t zv_curr_dist_counts[NCHUNKS]; -static atomic_t zv_cumul_dist_counts[NCHUNKS]; - -static struct zv_hdr *zv_create(struct zs_pool *pool, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct zv_hdr *zv; - u32 size = clen + sizeof(struct zv_hdr); - int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - void *handle = NULL; - - BUG_ON(!irqs_disabled()); - BUG_ON(chunks >= NCHUNKS); - handle = zs_malloc(pool, size); - if (!handle) - goto out; - atomic_inc(&zv_curr_dist_counts[chunks]); - atomic_inc(&zv_cumul_dist_counts[chunks]); - zv = zs_map_object(pool, handle); - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - zv->size = clen; - SET_SENTINEL(zv, ZVH); - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - zs_unmap_object(pool, handle); -out: - return handle; -} - -static void zv_free(struct zs_pool *pool, void *handle) -{ - unsigned long flags; - struct zv_hdr *zv; - uint16_t size; - int chunks; - - zv = zs_map_object(pool, handle); - ASSERT_SENTINEL(zv, ZVH); - size = zv->size + sizeof(struct zv_hdr); - INVERT_SENTINEL(zv, ZVH); - zs_unmap_object(pool, handle); - - chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - BUG_ON(chunks >= NCHUNKS); - atomic_dec(&zv_curr_dist_counts[chunks]); - - local_irq_save(flags); - zs_free(pool, handle); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, void *handle) -{ - unsigned int clen = PAGE_SIZE; - char *to_va; - int ret; - struct zv_hdr *zv; - - zv = zs_map_object(zcache_host.zspool, handle); - BUG_ON(zv->size == 0); - ASSERT_SENTINEL(zv, ZVH); - to_va = kmap_atomic(page); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv), - zv->size, to_va, &clen); - kunmap_atomic(to_va); - zs_unmap_object(zcache_host.zspool, handle); - BUG_ON(ret); - BUG_ON(clen != PAGE_SIZE); -} - -#ifdef CONFIG_SYSFS -/* - * show a distribution of compression stats for zv pages. - */ - -static int zv_curr_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_curr_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -static int zv_cumul_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_cumul_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -/* - * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected. We don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_zsize); -} - -static ssize_t zv_max_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_zsize = val; - return count; -} - -/* - * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected UNLESS the mean compression is also smaller - * than this value. In other words, we are load-balancing-by-zsize the - * accepted pages. Again, we don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_mean_zsize); -} - -static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_mean_zsize = val; - return count; -} - -/* - * setting zv_page_count_policy_percent via sysfs sets an upper bound of - * persistent (e.g. swap) pages that will be retained according to: - * (zv_page_count_policy_percent * totalram_pages) / 100) - * when that limit is reached, further puts will be rejected (until - * some pages have been flushed). Note that, due to compression, - * this number may exceed 100; it defaults to 75 and we set an - * arbitary limit of 150. A poor choice will almost certainly result - * in OOM's, so this value should only be changed prudently. - */ -static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_page_count_policy_percent); -} - -static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > 150)) - return -EINVAL; - zv_page_count_policy_percent = val; - return count; -} - -static struct kobj_attribute zcache_zv_max_zsize_attr = { - .attr = { .name = "zv_max_zsize", .mode = 0644 }, - .show = zv_max_zsize_show, - .store = zv_max_zsize_store, -}; - -static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { - .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, - .show = zv_max_mean_zsize_show, - .store = zv_max_mean_zsize_store, -}; - -static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { - .attr = { .name = "zv_page_count_policy_percent", - .mode = 0644 }, - .show = zv_page_count_policy_percent_show, - .store = zv_page_count_policy_percent_store, -}; -#endif - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_failed_pers_puts; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself): LOCAL_CLIENT. - * RAMster has each client numbered by cluster node, and a KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. - */ -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else { - if (cli_id >= MAX_CLIENTS) - goto out; - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - } - if (poolid < MAX_POOLS_PER_CLIENT) { - pool = cli->tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } -out: - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - struct zcache_client *cli = NULL; - - if (pool == NULL) - BUG(); - cli = pool->client; - atomic_dec(&pool->refcount); - atomic_dec(&cli->refcount); -} - -int zcache_new_client(uint16_t cli_id) -{ - struct zcache_client *cli = NULL; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - if (cli->allocated) - goto out; - cli->allocated = 1; -#ifdef CONFIG_FRONTSWAP - cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK); - if (cli->zspool == NULL) - goto out; -#endif - ret = 0; -out: - return ret; -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto out; - } - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - ret = 0; -out: - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len); - -static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index) -{ - void *pampd = NULL, *cdata; - unsigned clen; - int ret; - unsigned long count; - struct page *page = (struct page *)(data); - struct zcache_client *cli = pool->client; - uint16_t client_id = get_client_id_from_client(cli); - unsigned long zv_mean_zsize; - unsigned long curr_pers_pampd_count; - u64 total_zsize; - - if (eph) { - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zbud_create(client_id, pool->pool_id, oid, - index, page, cdata, clen); - if (pampd != NULL) { - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - } - } else { - curr_pers_pampd_count = - atomic_read(&zcache_curr_pers_pampd_count); - if (curr_pers_pampd_count > - (zv_page_count_policy_percent * totalram_pages) / 100) - goto out; - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - /* reject if compression is too poor */ - if (clen > zv_max_zsize) { - zcache_compress_poor++; - goto out; - } - /* reject if mean compression is too poor */ - if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { - total_zsize = zs_get_total_size_bytes(cli->zspool); - zv_mean_zsize = div_u64(total_zsize, - curr_pers_pampd_count); - if (zv_mean_zsize > zv_max_mean_zsize) { - zcache_mean_compress_poor++; - goto out; - } - } - pampd = (void *)zv_create(cli->zspool, pool->pool_id, - oid, index, cdata, clen); - if (pampd == NULL) - goto out; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - } -out: - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - - BUG_ON(is_ephemeral(pool)); - zv_decompress((struct page *)(data), pampd); - return ret; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - BUG_ON(!is_ephemeral(pool)); - if (zbud_decompress((struct page *)(data), pampd) < 0) - return -EINVAL; - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - return 0; -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! - */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - struct zcache_client *cli = pool->client; - - if (is_ephemeral(pool)) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); - } else { - zv_free(cli->zspool, pampd); - atomic_dec(&zcache_curr_pers_pampd_count); - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); - } -} - -static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj) -{ -} - -static void zcache_pampd_new_obj(struct tmem_obj *obj) -{ -} - -static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj) -{ - return -1; -} - -static bool zcache_pampd_is_remote(void *pampd) -{ - return 0; -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .get_data_and_free = zcache_pampd_get_data_and_free, - .free = zcache_pampd_free, - .free_obj = zcache_pampd_free_obj, - .new_obj = zcache_pampd_new_obj, - .replace_in_obj = zcache_pampd_replace_in_obj, - .is_remote = zcache_pampd_is_remote, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); -#define ZCACHE_DSTMEM_ORDER 1 - -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL)) - goto out; /* no buffer or no compressor so can't compress */ - *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER; - from_va = kmap_atomic(from); - mb(); - ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem, - out_len); - BUG_ON(ret); - *out_va = dmem; - kunmap_atomic(from_va); - ret = 1; -out: - return ret; -} - -static int zcache_comp_cpu_up(int cpu) -{ - struct crypto_comp *tfm; - - tfm = crypto_alloc_comp(zcache_comp_name, 0, 0); - if (IS_ERR(tfm)) - return NOTIFY_BAD; - *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; - return NOTIFY_OK; -} - -static void zcache_comp_cpu_down(int cpu) -{ - struct crypto_comp *tfm; - - tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); - crypto_free_comp(tfm); - *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; -} - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int ret, cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - ret = zcache_comp_cpu_up(cpu); - if (ret != NOTIFY_OK) { - pr_err("zcache: can't allocate compressor transform\n"); - return ret; - } - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - zcache_comp_cpu_down(cpu); - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - ZCACHE_DSTMEM_ORDER); - per_cpu(zcache_dstmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - if (kp->obj) { - kmem_cache_free(zcache_obj_cache, kp->obj); - kp->obj = NULL; - } - if (kp->page) { - free_page((unsigned long)kp->page); - kp->page = NULL; - } - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(zbpg_unused_list_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO(mean_compress_poor); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); -ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, - zv_curr_dist_counts_show); -ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, - zv_cumul_dist_counts_show); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_mean_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_zbpg_unused_list_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - &zcache_zv_curr_dist_counts_attr.attr, - &zcache_zv_cumul_dist_counts_attr.attr, - &zcache_zv_max_zsize_attr.attr, - &zcache_zv_max_mean_zsize_attr.attr, - &zcache_zv_page_count_policy_percent_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int ret = -1; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - zbud_evict_pages(nr); - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, (char *)(page), - PAGE_SIZE, 0, is_ephemeral(pool)); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - size_t size = PAGE_SIZE; - - local_irq_save(flags); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, (char *)(page), - &size, 0, is_ephemeral(pool)); - zcache_put_pool(pool); - } - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_page(int cli_id, int pool_id, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_object(int cli_id, int pool_id, - struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_destroy_pool(int cli_id, int pool_id) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool == NULL) - goto out; - cli->tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - atomic_dec(&cli->refcount); - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("zcache: destroyed pool id=%d, cli_id=%d\n", - pool_id, cli_id); -out: - return ret; -} - -static int zcache_new_pool(uint16_t cli_id, uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); - if (pool == NULL) { - pr_info("zcache: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (cli->tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("zcache: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - cli->tmem_pools[poolid] = pool; - pr_info("zcache: created %s tmem pool, id=%d, client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid, cli_id); -out: - if (cli != NULL) - atomic_dec(&cli->refcount); - return poolid; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. - */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page); -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - if (likely(ind == index)) - ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .invalidate_page = zcache_cleancache_flush_page, - .invalidate_inode = zcache_cleancache_flush_inode, - .invalidate_fs = zcache_cleancache_flush_fs, - .init_shared_fs = zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from - * frontswap_get_page(), but has side-effects. Hence using 8. - */ -#define SWIZ_BITS 8 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) - ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(LOCAL_CLIENT, - zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = - zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .invalidate_page = zcache_frontswap_flush_page, - .invalidate_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * zcache initialization - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! - */ - -static int zcache_enabled; - -static int __init enable_zcache(char *s) -{ - zcache_enabled = 1; - return 1; -} -__setup("zcache", enable_zcache); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init enable_zcache_compressor(char *s) -{ - strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ); - zcache_enabled = 1; - return 1; -} -__setup("zcache=", enable_zcache_compressor); - - -static int zcache_comp_init(void) -{ - int ret = 0; - - /* check crypto algorithm */ - if (*zcache_comp_name != '\0') { - ret = crypto_has_comp(zcache_comp_name, 0, 0); - if (!ret) - pr_info("zcache: %s not supported\n", - zcache_comp_name); - } - if (!ret) - strcpy(zcache_comp_name, "lzo"); - ret = crypto_has_comp(zcache_comp_name, 0, 0); - if (!ret) { - ret = 1; - goto out; - } - pr_info("zcache: using %s compressor\n", zcache_comp_name); - - /* alloc percpu transforms */ - ret = 0; - zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); - if (!zcache_comp_pcpu_tfms) - ret = 1; -out: - return ret; -} - -static int __init zcache_init(void) -{ - int ret = 0; - -#ifdef CONFIG_SYSFS - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - if (ret) { - pr_err("zcache: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (zcache_enabled) { - unsigned int cpu; - - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("zcache: can't register cpu notifier\n"); - goto out; - } - ret = zcache_comp_init(); - if (ret) { - pr_err("zcache: compressor initialization failed\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); - ret = zcache_new_client(LOCAL_CLIENT); - if (ret) { - pr_err("zcache: can't create client\n"); - goto out; - } -#endif -#ifdef CONFIG_CLEANCACHE - if (zcache_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("zcache: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("zcache: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - if (zcache_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - old_ops = zcache_frontswap_register_ops(); - pr_info("zcache: frontswap enabled using kernel " - "transcendent memory and zsmalloc\n"); - if (old_ops.init != NULL) - pr_warning("zcache: frontswap_ops overridden"); - } -#endif -out: - return ret; -} - -module_init(zcache_init) diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile deleted file mode 100644 index cb0f9ced6..000000000 --- a/drivers/staging/zram/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zram-y := zram_drv.o - -obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt deleted file mode 100644 index 765d790ae..000000000 --- a/drivers/staging/zram/zram.txt +++ /dev/null @@ -1,77 +0,0 @@ -zram: Compressed RAM based block devices ----------------------------------------- - -Project home: http://compcache.googlecode.com/ - -* Introduction - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -* Usage - -Following shows a typical sequence of steps for using zram. - -1) Load Module: - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - (num_devices parameter is optional. Default: 1) - -2) Set Disksize - Set disk size by writing the value to sysfs node 'disksize'. - The value can be either in bytes or you can use mem suffixes. - Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -3) Activate: - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -4) Stats: - Per-device statistics are exported as various nodes under - /sys/block/zram/ - disksize - num_reads - num_writes - invalid_io - notify_free - discard - zero_pages - orig_data_size - compr_data_size - mem_used_total - -5) Deactivate: - swapoff /dev/zram0 - umount /dev/zram1 - -6) Reset: - Write any positive value to 'reset' sysfs node - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Please report any problems at: - - Mailing list: linux-mm-cc at laptop dot org - - Issue tracker: http://code.google.com/p/compcache/issues/list - -Nitin Gupta -ngupta@vflare.org diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c deleted file mode 100644 index 83fa65773..000000000 --- a/drivers/staging/zram/zram_drv.c +++ /dev/null @@ -1,995 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com - */ - -#define KMSG_COMPONENT "zram" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zram_drv.h" - -/* Globals */ -static int zram_major; -static struct zram *zram_devices; - -/* - * We don't need to see memory allocation errors more than once every 1 - * second to know that a problem is occurring. - */ -#define ALLOC_ERROR_LOG_RATE_MS 1000 - -/* Module params (documentation at end) */ -static unsigned int num_devices = 1; - -static inline struct zram *dev_to_zram(struct device *dev) -{ - return (struct zram *)dev_to_disk(dev)->private_data; -} - -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", zram->disksize); -} - -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->stats.pages_zero); -} - -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} - -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; - - down_read(&zram->init_lock); - if (zram->init_done) - val = zs_get_total_size_bytes(meta->mem_pool); - up_read(&zram->init_lock); - - return sprintf(buf, "%llu\n", val); -} - -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - return meta->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].flags &= ~BIT(flag); -} - -static inline int is_partial_io(struct bio_vec *bvec) -{ - return bvec->bv_len != PAGE_SIZE; -} - -/* - * Check if request is within bounds and aligned on zram logical blocks. - */ -static inline int valid_io_request(struct zram *zram, struct bio *bio) -{ - u64 start, end, bound; - - /* unaligned request */ - if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - start = bio->bi_sector; - end = start + (bio->bi_size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; -} - -static void zram_meta_free(struct zram_meta *meta) -{ - zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); - vfree(meta->table); - kfree(meta); -} - -static struct zram_meta *zram_meta_alloc(u64 disksize) -{ - size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - if (!meta) - goto out; - - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - - num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto free_buffer; - } - - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM | - __GFP_NOWARN); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto free_table; - } - - return meta; - -free_table: - vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); -free_meta: - kfree(meta); - meta = NULL; -out: - return meta; -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; - } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - -static void zram_free_page(struct zram *zram, size_t index) -{ - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; - - if (unlikely(!handle)) { - /* - * No memory is allocated for zero filled pages. - * Simply clear zero page flag. - */ - if (zram_test_flag(meta, index, ZRAM_ZERO)) { - zram_clear_flag(meta, index, ZRAM_ZERO); - zram->stats.pages_zero--; - } - return; - } - - if (unlikely(size > max_zpage_size)) - zram->stats.bad_compress--; - - zs_free(meta->mem_pool, handle); - - if (size <= PAGE_SIZE / 2) - zram->stats.good_compress--; - - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - zram->stats.pages_stored--; - - meta->table[index].handle = 0; - meta->table[index].size = 0; -} - -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) -{ - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; - - if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - clear_page(mem); - return 0; - } - - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (meta->table[index].size == PAGE_SIZE) - copy_page(mem, cmem); - else - ret = lzo1x_decompress_safe(cmem, meta->table[index].size, - mem, &clen); - zs_unmap_object(meta->mem_pool, handle); - - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { - pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - atomic64_inc(&zram->stats.failed_reads); - return ret; - } - - return 0; -} - -static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) -{ - int ret; - struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_ZERO)) { - handle_zero_page(bvec); - return 0; - } - - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; - - if (!uncmem) { - pr_info("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; - } - - ret = zram_decompress_page(zram, uncmem, index); - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) - goto out_cleanup; - - if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); - - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); - if (is_partial_io(bvec)) - kfree(uncmem); - return ret; -} - -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) -{ - int ret = 0; - size_t clen; - unsigned long handle; - struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - static unsigned long zram_rs_time; - - page = bvec->bv_page; - src = meta->compress_buffer; - - if (is_partial_io(bvec)) { - /* - * This is a partial IO. We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; - } - - user_mem = kmap_atomic(page); - - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_zero_filled(uncmem)) { - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. */ - zram_free_page(zram, index); - - zram->stats.pages_zero++; - zram_set_flag(meta, index, ZRAM_ZERO); - ret = 0; - goto out; - } - - /* - * zram_slot_free_notify could miss free so that let's - * double check. - */ - if (unlikely(meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO))) - zram_free_page(zram, index); - - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); - - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } - - if (unlikely(ret != LZO_E_OK)) { - pr_err("Compression failed! err=%d\n", ret); - goto out; - } - - if (unlikely(clen > max_zpage_size)) { - zram->stats.bad_compress++; - clen = PAGE_SIZE; - src = NULL; - if (is_partial_io(bvec)) - src = uncmem; - } - - handle = zs_malloc(meta->mem_pool, clen); - if (!handle) { - if (printk_timed_ratelimit(&zram_rs_time, - ALLOC_ERROR_LOG_RATE_MS)) - pr_info("Error allocating memory for compressed page: %u, size=%zu\n", - index, clen); - ret = -ENOMEM; - goto out; - } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); - - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { - src = kmap_atomic(page); - copy_page(cmem, src); - kunmap_atomic(src); - } else { - memcpy(cmem, src, clen); - } - - zs_unmap_object(meta->mem_pool, handle); - - /* - * Free memory associated with this sector - * before overwriting unused sectors. - */ - zram_free_page(zram, index); - - meta->table[index].handle = handle; - meta->table[index].size = clen; - - /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - zram->stats.pages_stored++; - if (clen <= PAGE_SIZE / 2) - zram->stats.good_compress++; - -out: - if (is_partial_io(bvec)) - kfree(uncmem); - - if (ret) - atomic64_inc(&zram->stats.failed_writes); - return ret; -} - -static void handle_pending_slot_free(struct zram *zram) -{ - struct zram_slot_free *free_rq; - - spin_lock(&zram->slot_free_lock); - while (zram->slot_free_rq) { - free_rq = zram->slot_free_rq; - zram->slot_free_rq = free_rq->next; - zram_free_page(zram, free_rq->index); - kfree(free_rq); - } - spin_unlock(&zram->slot_free_lock); -} - -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) -{ - int ret; - - if (rw == READ) { - down_read(&zram->lock); - handle_pending_slot_free(zram); - ret = zram_bvec_read(zram, bvec, index, offset, bio); - up_read(&zram->lock); - } else { - down_write(&zram->lock); - handle_pending_slot_free(zram); - ret = zram_bvec_write(zram, bvec, index, offset); - up_write(&zram->lock); - } - - return ret; -} - -static void zram_reset_device(struct zram *zram, bool reset_capacity) -{ - size_t index; - struct zram_meta *meta; - - flush_work(&zram->free_work); - - down_write(&zram->init_lock); - if (!zram->init_done) { - up_write(&zram->init_lock); - return; - } - - meta = zram->meta; - zram->init_done = 0; - - /* Free all pages that are still in this zram device */ - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { - unsigned long handle = meta->table[index].handle; - if (!handle) - continue; - - zs_free(meta->mem_pool, handle); - } - - zram_meta_free(zram->meta); - zram->meta = NULL; - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - - zram->disksize = 0; - if (reset_capacity) - set_capacity(zram->disk, 0); - up_write(&zram->init_lock); -} - -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - - zram->meta = meta; - zram->init_done = 1; - - pr_debug("Initialization done!\n"); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); - down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - zram_meta_free(meta); - pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; - } - - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); - up_write(&zram->init_lock); - - return len; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - /* Do not reset an active device! */ - if (bdev->bd_holders) - return -EBUSY; - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - return ret; - - if (!do_reset) - return -EINVAL; - - /* Make sure all pending I/O is finished */ - if (bdev) - fsync_bdev(bdev); - - zram_reset_device(zram, true); - return len; -} - -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) -{ - int i, offset; - u32 index; - struct bio_vec *bvec; - - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - - index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - bio_for_each_segment(bvec, bio, i) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec->bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. - */ - struct bio_vec bv; - - bv.bv_page = bvec->bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec->bv_offset; - - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) - goto out; - - bv.bv_len = bvec->bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) - goto out; - } else - if (zram_bvec_rw(zram, bvec, index, offset, bio, rw) - < 0) - goto out; - - update_position(&index, &offset, bvec); - } - - set_bit(BIO_UPTODATE, &bio->bi_flags); - bio_endio(bio, 0); - return; - -out: - bio_io_error(bio); -} - -/* - * Handler function for all zram I/O requests. - */ -static void zram_make_request(struct request_queue *queue, struct bio *bio) -{ - struct zram *zram = queue->queuedata; - - down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) - goto error; - - if (!valid_io_request(zram, bio)) { - atomic64_inc(&zram->stats.invalid_io); - goto error; - } - - __zram_make_request(zram, bio, bio_data_dir(bio)); - up_read(&zram->init_lock); - - return; - -error: - up_read(&zram->init_lock); - bio_io_error(bio); -} - -static void zram_slot_free(struct work_struct *work) -{ - struct zram *zram; - - zram = container_of(work, struct zram, free_work); - down_write(&zram->lock); - handle_pending_slot_free(zram); - up_write(&zram->lock); -} - -static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq) -{ - spin_lock(&zram->slot_free_lock); - free_rq->next = zram->slot_free_rq; - zram->slot_free_rq = free_rq; - spin_unlock(&zram->slot_free_lock); -} - -static void zram_slot_free_notify(struct block_device *bdev, - unsigned long index) -{ - struct zram *zram; - struct zram_slot_free *free_rq; - - zram = bdev->bd_disk->private_data; - atomic64_inc(&zram->stats.notify_free); - - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); - if (!free_rq) - return; - - free_rq->index = index; - add_slot_free(zram, free_rq); - schedule_work(&zram->free_work); -} - -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .owner = THIS_MODULE -}; - -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); - -static struct attribute *zram_disk_attrs[] = { - &dev_attr_disksize.attr, - &dev_attr_initstate.attr, - &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, - NULL, -}; - -static struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static int create_device(struct zram *zram, int device_id) -{ - int ret = -ENOMEM; - - init_rwsem(&zram->lock); - init_rwsem(&zram->init_lock); - - INIT_WORK(&zram->free_work, zram_slot_free); - spin_lock_init(&zram->slot_free_lock); - zram->slot_free_rq = NULL; - - zram->queue = blk_alloc_queue(GFP_KERNEL); - if (!zram->queue) { - pr_err("Error allocating disk queue for device %d\n", - device_id); - goto out; - } - - blk_queue_make_request(zram->queue, zram_make_request); - zram->queue->queuedata = zram; - - /* gendisk structure */ - zram->disk = alloc_disk(1); - if (!zram->disk) { - pr_warn("Error allocating disk structure for device %d\n", - device_id); - goto out_free_queue; - } - - zram->disk->major = zram_major; - zram->disk->first_minor = device_id; - zram->disk->fops = &zram_devops; - zram->disk->queue = zram->queue; - zram->disk->private_data = zram; - snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - - /* Actual capacity set using syfs (/sys/block/zram/disksize */ - set_capacity(zram->disk, 0); - - /* - * To ensure that we always get PAGE_SIZE aligned - * and n*PAGE_SIZED sized I/O requests. - */ - blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); - blk_queue_logical_block_size(zram->disk->queue, - ZRAM_LOGICAL_BLOCK_SIZE); - blk_queue_io_min(zram->disk->queue, PAGE_SIZE); - blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); - - add_disk(zram->disk); - - ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); - if (ret < 0) { - pr_warn("Error creating sysfs group"); - goto out_free_disk; - } - - zram->init_done = 0; - return 0; - -out_free_disk: - del_gendisk(zram->disk); - put_disk(zram->disk); -out_free_queue: - blk_cleanup_queue(zram->queue); -out: - return ret; -} - -static void destroy_device(struct zram *zram) -{ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); - - if (zram->disk) { - del_gendisk(zram->disk); - put_disk(zram->disk); - } - - if (zram->queue) - blk_cleanup_queue(zram->queue); -} - -static int __init zram_init(void) -{ - int ret, dev_id; - - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - ret = -EINVAL; - goto out; - } - - zram_major = register_blkdev(0, "zram"); - if (zram_major <= 0) { - pr_warn("Unable to get major number\n"); - ret = -EBUSY; - goto out; - } - - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - ret = -ENOMEM; - goto unregister; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) - goto free_devices; - } - - pr_info("Created %u device(s) ...\n", num_devices); - - return 0; - -free_devices: - while (dev_id) - destroy_device(&zram_devices[--dev_id]); - kfree(zram_devices); -unregister: - unregister_blkdev(zram_major, "zram"); -out: - return ret; -} - -static void __exit zram_exit(void) -{ - int i; - struct zram *zram; - - for (i = 0; i < num_devices; i++) { - zram = &zram_devices[i]; - - destroy_device(zram); - /* - * Shouldn't access zram->disk after destroy_device - * because destroy_device already released zram->disk. - */ - zram_reset_device(zram, false); - } - - unregister_blkdev(zram_major, "zram"); - - kfree(zram_devices); - pr_debug("Cleanup done!\n"); -} - -module_init(zram_init); -module_exit(zram_exit); - -module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Nitin Gupta "); -MODULE_DESCRIPTION("Compressed RAM Block Device"); -MODULE_ALIAS("devname:zram"); diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c deleted file mode 100644 index aafcf0330..000000000 --- a/drivers/staging/zram/zram_sysfs.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com/ - */ - -#include -#include -#include - -#include "zram_drv.h" - -static u64 zram_stat64_read(struct zram *zram, u64 *v) -{ - u64 val; - - spin_lock(&zram->stat64_lock); - val = *v; - spin_unlock(&zram->stat64_lock); - - return val; -} - -static struct zram *dev_to_zram(struct device *dev) -{ - int i; - struct zram *zram = NULL; - - for (i = 0; i < zram_get_num_devices(); i++) { - zram = &zram_devices[i]; - if (disk_to_dev(zram->disk) == dev) - break; - } - - return zram; -} - -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", zram->disksize); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - u64 disksize; - struct zram *zram = dev_to_zram(dev); - - ret = kstrtoull(buf, 10, &disksize); - if (ret) - return ret; - - down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; - } - - zram->disksize = PAGE_ALIGN(disksize); - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - return len; -} - -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - if (!bdev) - return -ENOMEM; - - /* Do not reset an active device! */ - if (bdev->bd_holders) { - ret = -EBUSY; - goto out; - } - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; - - if (!do_reset) { - ret = -EINVAL; - goto out; - } - - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - bdput(bdev); - - down_write(&zram->init_lock); - if (zram->init_done) - __zram_reset_device(zram); - up_write(&zram->init_lock); - - return len; - -out: - bdput(bdev); - return ret; -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->stats.pages_zero); -} - -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} - -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.compr_size)); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - if (zram->init_done) { - val = zs_get_total_size_bytes(zram->mem_pool) + - ((u64)(zram->stats.pages_expand) << PAGE_SHIFT); - } - up_read(&zram->init_lock); - - return sprintf(buf, "%llu\n", val); -} - -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); - -static struct attribute *zram_disk_attrs[] = { - &dev_attr_disksize.attr, - &dev_attr_initstate.attr, - &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, - NULL, -}; - -struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig deleted file mode 100644 index 7fab03229..000000000 --- a/drivers/staging/zsmalloc/Kconfig +++ /dev/null @@ -1,10 +0,0 @@ -config ZSMALLOC - bool "Memory allocator for compressed pages" - default n - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile deleted file mode 100644 index b134848a5..000000000 --- a/drivers/staging/zsmalloc/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zsmalloc-y := zsmalloc-main.o - -obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c deleted file mode 100644 index 41a68032c..000000000 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - - -/* - * This allocator is designed for use with zcache and zram. Thus, the - * allocator is supposed to work well under low memory conditions. In - * particular, it never attempts higher order page allocation which is - * very likely to fail under memory pressure. On the other hand, if we - * just use single (0-order) pages, it would suffer from very high - * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy - * an entire page. This was one of the major issues with its predecessor - * (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page - * page->index (union with page->freelist): offset of the first object - * starting in this page. For the first page, this is - * always 0, so we use this field (aka freelist) to point - * to the first free object in zspage. - * page->lru: links together all component pages (except the first page) - * of a zspage - * - * For _first_ page only: - * - * page->private (union with page->first_page): refers to the - * component page after the first page - * page->freelist: points to the first free object in zspage. - * Free objects are linked together using in-place - * metadata. - * page->objects: maximum number of objects we can store in this - * zspage (class->zspage_order * PAGE_SIZE / class->size) - * page->lru: links together first pages of various zspages. - * Basically forming list of zspages in a fullness group. - * page->mapping: class index and fullness group of the zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_private2: identifies the last component page - * - */ - -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zsmalloc.h" - -/* - * This must be power of 2 and greater than of equal to sizeof(link_free). - * These two conditions ensure that any 'struct link_free' itself doesn't - * span more than 1 page which avoids complex case of mapping 2 pages simply - * to restore link_free pointer values. - */ -#define ZS_ALIGN 8 - -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - -/* - * Object location (, ) is encoded as - * as single (void *) handle value. - * - * Note that object index is relative to system - * page it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. - * - * This is made more complicated by various memory models and PAE. - */ - -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ -/* - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just - * be PAGE_SHIFT - */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG -#endif -#endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) - -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) -/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ -#define ZS_MIN_ALLOC_SIZE \ - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -#define ZS_MAX_ALLOC_SIZE PAGE_SIZE - -/* - * On systems with 4K page size, this gives 254 size classes! There is a - * trader-off here: - * - Large number of size classes is potentially wasteful as free page are - * spread across these classes - * - Small number of size classes causes large internal fragmentation - * - Probably its better to use specific size classes (empirically - * determined). NOTE: all those class sizes must be set as multiple of - * ZS_ALIGN to make sure link_free itself never has to span 2 pages. - * - * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN - * (reason above) - */ -#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) - -/* - * We do not maintain any list for completely empty or full pages - */ -enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - - ZS_EMPTY, - ZS_FULL -}; - -/* - * We assign a page to ZS_ALMOST_EMPTY fullness group when: - * n <= N / f, where - * n = number of allocated objects - * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac - * - * Similarly, we assign zspage to: - * ZS_ALMOST_FULL when n > N / f - * ZS_EMPTY when n == 0 - * ZS_FULL when n == N - * - * (see: fix_fullness_group()) - */ -static const int fullness_threshold_frac = 4; - -struct size_class { - /* - * Size of objects stored in this class. Must be multiple - * of ZS_ALIGN. - */ - int size; - unsigned int index; - - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int pages_per_zspage; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; -}; - -/* - * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. - * - * This must be power of 2 and less than or equal to ZS_ALIGN - */ -struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; -}; - -struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; - - gfp_t flags; /* allocation flags used when growing pool */ -}; - -/* - * A zspage's class index and fullness group - * are encoded in its (first)page->mapping - */ -#define CLASS_IDX_BITS 28 -#define FULLNESS_BITS 4 -#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) -#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) - -/* - * By default, zsmalloc uses a copy-based object mapping method to access - * allocations that span two pages. However, if a particular architecture - * performs VM mapping faster than copying, then it should be added here - * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use - * page table mapping rather than copying for object mapping. - */ -#if defined(CONFIG_ARM) && !defined(MODULE) -#define USE_PGTABLE_MAPPING -#endif - -struct mapping_area { -#ifdef USE_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else - char *vm_buf; /* copy buffer for objects that span pages */ -#endif - char *vm_addr; /* address of kmap_atomic()'ed pages */ - enum zs_mapmode vm_mm; /* mapping mode */ -}; - - -/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); - -static int is_first_page(struct page *page) -{ - return PagePrivate(page); -} - -static int is_last_page(struct page *page) -{ - return PagePrivate2(page); -} - -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, - enum fullness_group *fullness) -{ - unsigned long m; - BUG_ON(!is_first_page(page)); - - m = (unsigned long)page->mapping; - *fullness = m & FULLNESS_MASK; - *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; -} - -static void set_zspage_mapping(struct page *page, unsigned int class_idx, - enum fullness_group fullness) -{ - unsigned long m; - BUG_ON(!is_first_page(page)); - - m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | - (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; -} - -static int get_size_class_index(int size) -{ - int idx = 0; - - if (likely(size > ZS_MIN_ALLOC_SIZE)) - idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, - ZS_SIZE_CLASS_DELTA); - - return idx; -} - -static enum fullness_group get_fullness_group(struct page *page) -{ - int inuse, max_objects; - enum fullness_group fg; - BUG_ON(!is_first_page(page)); - - inuse = page->inuse; - max_objects = page->objects; - - if (inuse == 0) - fg = ZS_EMPTY; - else if (inuse == max_objects) - fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) - fg = ZS_ALMOST_EMPTY; - else - fg = ZS_ALMOST_FULL; - - return fg; -} - -static void insert_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) -{ - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); - - *head = page; -} - -static void remove_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) -{ - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - BUG_ON(!*head); - if (list_empty(&(*head)->lru)) - *head = NULL; - else if (*head == page) - *head = (struct page *)list_entry((*head)->lru.next, - struct page, lru); - - list_del_init(&page->lru); -} - -static enum fullness_group fix_fullness_group(struct zs_pool *pool, - struct page *page) -{ - int class_idx; - struct size_class *class; - enum fullness_group currfg, newfg; - - BUG_ON(!is_first_page(page)); - - get_zspage_mapping(page, &class_idx, &currfg); - newfg = get_fullness_group(page); - if (newfg == currfg) - goto out; - - class = &pool->size_class[class_idx]; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); - -out: - return newfg; -} - -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp - Zp % size_class - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. - */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - -/* - * A single 'zspage' is composed of many system pages which are - * linked together using fields in struct page. This function finds - * the first/head page, given any component page of a zspage. - */ -static struct page *get_first_page(struct page *page) -{ - if (is_first_page(page)) - return page; - else - return page->first_page; -} - -static struct page *get_next_page(struct page *page) -{ - struct page *next; - - if (is_last_page(page)) - next = NULL; - else if (is_first_page(page)) - next = (struct page *)page_private(page); - else - next = list_entry(page->lru.next, struct page, lru); - - return next; -} - -/* - * Encode as a single handle value. - * On hardware platforms with physical memory starting at 0x0 the pfn - * could be 0 so we ensure that the handle will never be 0 by adjusting the - * encoded obj_idx value before encoding. - */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) -{ - unsigned long handle; - - if (!page) { - BUG_ON(obj_idx); - return NULL; - } - - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); - - return (void *)handle; -} - -/* - * Decode pair from the given object handle. We adjust the - * decoded obj_idx back to its original value since it was adjusted in - * obj_location_to_handle(). - */ -static void obj_handle_to_location(unsigned long handle, struct page **page, - unsigned long *obj_idx) -{ - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; -} - -static unsigned long obj_idx_to_offset(struct page *page, - unsigned long obj_idx, int class_size) -{ - unsigned long off = 0; - - if (!is_first_page(page)) - off = page->index; - - return off + obj_idx * class_size; -} - -static void reset_page(struct page *page) -{ - clear_bit(PG_private, &page->flags); - clear_bit(PG_private_2, &page->flags); - set_page_private(page, 0); - page->mapping = NULL; - page->freelist = NULL; - reset_page_mapcount(page); -} - -static void free_zspage(struct page *first_page) -{ - struct page *nextp, *tmp, *head_extra; - - BUG_ON(!is_first_page(first_page)); - BUG_ON(first_page->inuse); - - head_extra = (struct page *)page_private(first_page); - - reset_page(first_page); - __free_page(first_page); - - /* zspage with only 1 system page */ - if (!head_extra) - return; - - list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { - list_del(&nextp->lru); - reset_page(nextp); - __free_page(nextp); - } - reset_page(head_extra); - __free_page(head_extra); -} - -/* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) -{ - unsigned long off = 0; - struct page *page = first_page; - - BUG_ON(!is_first_page(first_page)); - while (page) { - struct page *next_page; - struct link_free *link; - unsigned int i, objs_on_page; - - /* - * page->index stores offset of first object starting - * in the page. For the first page, this is always 0, - * so we use first_page->index (aka ->freelist) to store - * head of corresponding zspage's freelist. - */ - if (page != first_page) - page->index = off; - - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; - - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } - } - - /* - * We now come to the last (full or partial) object on this - * page, which must point to the first object on the next - * page (if present) - */ - next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); - page = next_page; - off = (off + class->size) % PAGE_SIZE; - } -} - -/* - * Allocate a zspage for the given size class - */ -static struct page *alloc_zspage(struct size_class *class, gfp_t flags) -{ - int i, error; - struct page *first_page = NULL, *uninitialized_var(prev_page); - - /* - * Allocate individual pages and link them together as: - * 1. first page->private = first sub-page - * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page - * - * For each size class, First/Head pages are linked together using - * page->lru. Also, we set PG_private to identify the first page - * (i.e. no other sub-page has this flag set) and PG_private_2 to - * identify the last page. - */ - error = -ENOMEM; - for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; - - page = alloc_page(flags); - if (!page) - goto cleanup; - - INIT_LIST_HEAD(&page->lru); - if (i == 0) { /* first page */ - SetPagePrivate(page); - set_page_private(page, 0); - first_page = page; - first_page->inuse = 0; - } - if (i == 1) - set_page_private(first_page, (unsigned long)page); - if (i >= 1) - page->first_page = first_page; - if (i >= 2) - list_add(&page->lru, &prev_page->lru); - if (i == class->pages_per_zspage - 1) /* last page */ - SetPagePrivate2(page); - prev_page = page; - } - - init_zspage(first_page, class); - - first_page->freelist = obj_location_to_handle(first_page, 0); - /* Maximum number of objects we can store in this zspage */ - first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; - - error = 0; /* Success */ - -cleanup: - if (unlikely(error) && first_page) { - free_zspage(first_page); - first_page = NULL; - } - - return first_page; -} - -static struct page *find_get_zspage(struct size_class *class) -{ - int i; - struct page *page; - - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) - break; - } - - return page; -} - -#ifdef USE_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); - if (!area->vm) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* USE_PGTABLE_MAPPING */ - -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm_buf) - return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); - if (!area->vm_buf) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); - area->vm_buf = NULL; -} - -static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - int sizes[2]; - void *addr; - char *buf = area->vm_buf; - - /* disable page faults to match kmap_atomic() return conditions */ - pagefault_disable(); - - /* no read fastpath */ - if (area->vm_mm == ZS_MM_WO) - goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy object to per-cpu buffer */ - addr = kmap_atomic(pages[0]); - memcpy(buf, addr + off, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); - memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_atomic(addr); -out: - return area->vm_buf; -} - -static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - int sizes[2]; - void *addr; - char *buf = area->vm_buf; - - /* no write fastpath */ - if (area->vm_mm == ZS_MM_RO) - goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy per-cpu buffer to object */ - addr = kmap_atomic(pages[0]); - memcpy(addr + off, buf, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); - memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_atomic(addr); - -out: - /* enable page faults to match kunmap_atomic() return conditions */ - pagefault_enable(); -} - -#endif /* USE_PGTABLE_MAPPING */ - -static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, - void *pcpu) -{ - int ret, cpu = (long)pcpu; - struct mapping_area *area; - - switch (action) { - case CPU_UP_PREPARE: - area = &per_cpu(zs_map_area, cpu); - ret = __zs_cpu_up(area); - if (ret) - return notifier_from_errno(ret); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block zs_cpu_nb = { - .notifier_call = zs_cpu_notifier -}; - -static void zs_exit(void) -{ - int cpu; - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); -} - -static int zs_init(void) -{ - int cpu, ret; - - register_cpu_notifier(&zs_cpu_nb); - for_each_online_cpu(cpu) { - ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) - goto fail; - } - return 0; -fail: - zs_exit(); - return notifier_to_errno(ret); -} - -/** - * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata - * - * This function must be called before anything when using - * the zsmalloc allocator. - * - * On success, a pointer to the newly created pool is returned, - * otherwise NULL. - */ -struct zs_pool *zs_create_pool(gfp_t flags) -{ - int i, ovhd_size; - struct zs_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int size; - struct size_class *class; - - size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; - if (size > ZS_MAX_ALLOC_SIZE) - size = ZS_MAX_ALLOC_SIZE; - - class = &pool->size_class[i]; - class->size = size; - class->index = i; - spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); - - } - - pool->flags = flags; - - return pool; -} -EXPORT_SYMBOL_GPL(zs_create_pool); - -void zs_destroy_pool(struct zs_pool *pool) -{ - int i; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int fg; - struct size_class *class = &pool->size_class[i]; - - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { - pr_info("Freeing non-empty class with size %db, fullness group %d\n", - class->size, fg); - } - } - } - kfree(pool); -} -EXPORT_SYMBOL_GPL(zs_destroy_pool); - -/** - * zs_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * - * On success, handle to the allocated object is returned, - * otherwise 0. - * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. - */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) -{ - unsigned long obj; - struct link_free *link; - int class_idx; - struct size_class *class; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; - - if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; - - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); - - spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { - spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); - if (unlikely(!first_page)) - return 0; - - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; - } - - obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); - first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); - - first_page->inuse++; - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); - spin_unlock(&class->lock); - - return obj; -} -EXPORT_SYMBOL_GPL(zs_malloc); - -void zs_free(struct zs_pool *pool, unsigned long obj) -{ - struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; - - int class_idx; - struct size_class *class; - enum fullness_group fullness; - - if (unlikely(!obj)) - return; - - obj_handle_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - - spin_lock(&class->lock); - - /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); - link->next = first_page->freelist; - kunmap_atomic(link); - first_page->freelist = (void *)obj; - - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - - spin_unlock(&class->lock); - - if (fullness == ZS_EMPTY) - free_zspage(first_page); -} -EXPORT_SYMBOL_GPL(zs_free); - -/** - * zs_map_object - get address of allocated object from handle. - * @pool: pool from which the object was allocated - * @handle: handle returned from zs_malloc - * - * Before using an object allocated from zs_malloc, it must be mapped using - * this function. When done with the object, it must be unmapped using - * zs_unmap_object. - * - * Only one object can be mapped per cpu at a time. There is no protection - * against nested mappings. - * - * This function returns with preemption and page faults disabled. - */ -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm) -{ - struct page *page; - unsigned long obj_idx, off; - - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; - struct page *pages[2]; - - BUG_ON(!handle); - - /* - * Because we use per-cpu mapping areas shared among the - * pools/users, we can't allow mapping in interrupt context - * because it can corrupt another users mappings. - */ - BUG_ON(in_interrupt()); - - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); - - area = &get_cpu_var(zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ - area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; - } - - /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); - - return __zs_map_object(area, pages, off, class->size); -} -EXPORT_SYMBOL_GPL(zs_map_object); - -void zs_unmap_object(struct zs_pool *pool, unsigned long handle) -{ - struct page *page; - unsigned long obj_idx, off; - - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; - - BUG_ON(!handle); - - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); - - area = &__get_cpu_var(zs_map_area); - if (off + class->size <= PAGE_SIZE) - kunmap_atomic(area->vm_addr); - else { - struct page *pages[2]; - - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); - - __zs_unmap_object(area, pages, off, class->size); - } - put_cpu_var(zs_map_area); -} -EXPORT_SYMBOL_GPL(zs_unmap_object); - -u64 zs_get_total_size_bytes(struct zs_pool *pool) -{ - int i; - u64 npages = 0; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; - - return npages << PAGE_SHIFT; -} -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); - -module_init(zs_init); -module_exit(zs_exit); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Nitin Gupta "); diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h deleted file mode 100644 index fbe6bec42..000000000 --- a/drivers/staging/zsmalloc/zsmalloc.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _ZS_MALLOC_H_ -#define _ZS_MALLOC_H_ - -#include - -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ -}; - -struct zs_pool; - -struct zs_pool *zs_create_pool(gfp_t flags); -void zs_destroy_pool(struct zs_pool *pool); - -unsigned long zs_malloc(struct zs_pool *pool, size_t size); -void zs_free(struct zs_pool *pool, unsigned long obj); - -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - -u64 zs_get_total_size_bytes(struct zs_pool *pool); - -#endif diff --git a/drivers/staging/zsmalloc/zsmalloc_int.h b/drivers/staging/zsmalloc/zsmalloc_int.h deleted file mode 100644 index 92eefc663..000000000 --- a/drivers/staging/zsmalloc/zsmalloc_int.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _ZS_MALLOC_INT_H_ -#define _ZS_MALLOC_INT_H_ - -#include -#include -#include - -/* - * This must be power of 2 and greater than of equal to sizeof(link_free). - * These two conditions ensure that any 'struct link_free' itself doesn't - * span more than 1 page which avoids complex case of mapping 2 pages simply - * to restore link_free pointer values. - */ -#define ZS_ALIGN 8 - -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - -/* - * Object location (, ) is encoded as - * as single (void *) handle value. - * - * Note that object index is relative to system - * page it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. - * - * This is made more complicated by various memory models and PAE. - */ - -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ -/* - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just - * be PAGE_SHIFT - */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG -#endif -#endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) - -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) -/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ -#define ZS_MIN_ALLOC_SIZE \ - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -#define ZS_MAX_ALLOC_SIZE PAGE_SIZE - -/* - * On systems with 4K page size, this gives 254 size classes! There is a - * trader-off here: - * - Large number of size classes is potentially wasteful as free page are - * spread across these classes - * - Small number of size classes causes large internal fragmentation - * - Probably its better to use specific size classes (empirically - * determined). NOTE: all those class sizes must be set as multiple of - * ZS_ALIGN to make sure link_free itself never has to span 2 pages. - * - * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN - * (reason above) - */ -#define ZS_SIZE_CLASS_DELTA 16 -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) - -/* - * We do not maintain any list for completely empty or full pages - */ -enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - - ZS_EMPTY, - ZS_FULL -}; - -/* - * We assign a page to ZS_ALMOST_EMPTY fullness group when: - * n <= N / f, where - * n = number of allocated objects - * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac - * - * Similarly, we assign zspage to: - * ZS_ALMOST_FULL when n > N / f - * ZS_EMPTY when n == 0 - * ZS_FULL when n == N - * - * (see: fix_fullness_group()) - */ -static const int fullness_threshold_frac = 4; - -struct mapping_area { - struct vm_struct *vm; - pte_t *vm_ptes[2]; - char *vm_addr; -}; - -struct size_class { - /* - * Size of objects stored in this class. Must be multiple - * of ZS_ALIGN. - */ - int size; - unsigned int index; - - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int zspage_order; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; -}; - -/* - * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. - * - * This must be power of 2 and less than or equal to ZS_ALIGN - */ -struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; -}; - -struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; - - gfp_t flags; /* allocation flags used when growing pool */ - const char *name; -}; - -#endif diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 0a2d86ea5..b7df271f2 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1310,8 +1310,7 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c) tty->canon_data++; spin_unlock_irqrestore(&tty->read_lock, flags); kill_fasync(&tty->fasync, SIGIO, POLL_IN); - if (waitqueue_active(&tty->read_wait)) - wake_up_interruptible(&tty->read_wait); + wake_up_interruptible(&tty->read_wait); return; } } @@ -1434,8 +1433,7 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp, if ((!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) || L_EXTPROC(tty)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); - if (waitqueue_active(&tty->read_wait)) - wake_up_interruptible(&tty->read_wait); + wake_up_interruptible(&tty->read_wait); } /* diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 8d4a762aa..061fcf47f 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2018,8 +2018,24 @@ static int tty_open(struct inode *inode, struct file *filp) if (!noctty && current->signal->leader && !current->signal->tty && - tty->session == NULL) - __proc_set_tty(current, tty); + tty->session == NULL) { + /* + * Don't let a process that only has write access to the tty + * obtain the privileges associated with having a tty as + * controlling terminal (being able to reopen it with full + * access through /dev/tty, being able to perform pushback). + * Many distributions set the group of all ttys to "tty" and + * grant write-only access to all terminals for setgid tty + * binaries, which should not imply full privileges on all ttys. + * + * This could theoretically break old code that performs open() + * on a write-only file descriptor. In that case, it might be + * necessary to also permit this if + * inode_permission(inode, MAY_READ) == 0. + */ + if (filp->f_mode & FMODE_READ) + __proc_set_tty(current, tty); + } spin_unlock_irq(¤t->sighand->siglock); tty_unlock(); mutex_unlock(&tty_mutex); @@ -2308,7 +2324,7 @@ static int fionbio(struct file *file, int __user *p) * Takes ->siglock() when updating signal->tty */ -static int tiocsctty(struct tty_struct *tty, int arg) +static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) { int ret = 0; if (current->signal->leader && (task_session(current) == tty->session)) @@ -2341,6 +2357,13 @@ static int tiocsctty(struct tty_struct *tty, int arg) goto unlock; } } + + /* See the comment in tty_open(). */ + if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto unlock; + } + proc_set_tty(current, tty); unlock: mutex_unlock(&tty_mutex); @@ -2698,7 +2721,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) no_tty(); return 0; case TIOCSCTTY: - return tiocsctty(tty, arg); + return tiocsctty(tty, file, arg); case TIOCGPGRP: return tiocgpgrp(tty, real_tty, p); case TIOCSPGRP: diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index cc1004a2f..bfc9b6912 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -114,16 +114,18 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 16; } else if (usb_endpoint_xfer_isoc(&ep->desc) && - desc->bmAttributes > 2) { + USB_SS_MULT(desc->bmAttributes) > 3) { dev_warn(ddev, "Isoc endpoint has Mult of %d in " "config %d interface %d altsetting %d ep %d: " - "setting to 3\n", desc->bmAttributes + 1, + "setting to 3\n", + USB_SS_MULT(desc->bmAttributes), cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 2; } if (usb_endpoint_xfer_isoc(&ep->desc)) - max_tx = (desc->bMaxBurst + 1) * (desc->bmAttributes + 1) * + max_tx = (desc->bMaxBurst + 1) * + (USB_SS_MULT(desc->bmAttributes)) * usb_endpoint_maxp(&ep->desc); else if (usb_endpoint_xfer_int(&ep->desc)) max_tx = usb_endpoint_maxp(&ep->desc) * diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index e845a1f80..392c125fc 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -49,6 +49,13 @@ static const struct usb_device_id usb_quirk_list[] = { /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Logitech ConferenceCam CC3000e */ + { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, + + /* Logitech PTZ Pro Camera */ + { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, + /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, @@ -73,6 +80,12 @@ static const struct usb_device_id usb_quirk_list[] = { /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Plantronic Audio 655 DSP */ + { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, + + /* Plantronic Audio 648 USB */ + { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, diff --git a/drivers/usb/host/ehci-sysfs.c b/drivers/usb/host/ehci-sysfs.c index 14ced00ba..065902429 100644 --- a/drivers/usb/host/ehci-sysfs.c +++ b/drivers/usb/host/ehci-sysfs.c @@ -29,7 +29,7 @@ static ssize_t show_companion(struct device *dev, int count = PAGE_SIZE; char *ptr = buf; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); nports = HCS_N_PORTS(ehci->hcs_params); for (index = 0; index < nports; ++index) { @@ -54,7 +54,7 @@ static ssize_t store_companion(struct device *dev, struct ehci_hcd *ehci; int portnum, new_owner; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); new_owner = PORT_OWNER; /* Owned by companion */ if (sscanf(buf, "%d", &portnum) != 1) return -EINVAL; @@ -85,7 +85,7 @@ static ssize_t show_uframe_periodic_max(struct device *dev, struct ehci_hcd *ehci; int n; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); n = scnprintf(buf, PAGE_SIZE, "%d\n", ehci->uframe_periodic_max); return n; } @@ -102,7 +102,7 @@ static ssize_t store_uframe_periodic_max(struct device *dev, unsigned long flags; ssize_t ret; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); if (kstrtouint(buf, 0, &uframe_periodic_max) < 0) return -EINVAL; diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 048cc382a..cad4a174e 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1493,10 +1493,10 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, * use Event Data TRBs, and we don't chain in a link TRB on short * transfers, we're basically dividing by 1. * - * xHCI 1.0 specification indicates that the Average TRB Length should - * be set to 8 for control endpoints. + * xHCI 1.0 and 1.1 specification indicates that the Average TRB Length + * should be set to 8 for control endpoints. */ - if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version == 0x100) + if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100) ep_ctx->tx_info |= cpu_to_le32(AVG_TRB_LENGTH_FOR_EP(8)); else ep_ctx->tx_info |= diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 710b2e98b..305393373 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -121,6 +121,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) * PPT chipsets. */ xhci->quirks |= XHCI_SPURIOUS_REBOOT; + xhci->quirks |= XHCI_SPURIOUS_WAKEUP; } if (pdev->vendor == PCI_VENDOR_ID_INTEL && (pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI || diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index d857ba543..5d4f1b754 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -331,6 +331,15 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci) ret = handshake(xhci, &xhci->op_regs->cmd_ring, CMD_RING_RUNNING, 0, 5 * 1000 * 1000); if (ret < 0) { + /* we are about to kill xhci, give it one more chance */ + xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, + &xhci->op_regs->cmd_ring); + udelay(1000); + ret = handshake(xhci, &xhci->op_regs->cmd_ring, + CMD_RING_RUNNING, 0, 3 * 1000 * 1000); + if (ret == 0) + return 0; + xhci_err(xhci, "Stopped the command ring failed, " "maybe the host is dead\n"); xhci->xhc_state |= XHCI_STATE_DYING; @@ -2337,6 +2346,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, u32 trb_comp_code; int ret = 0; int td_num = 0; + bool handling_skipped_tds = false; slot_id = TRB_TO_SLOT_ID(le32_to_cpu(event->flags)); xdev = xhci->devs[slot_id]; @@ -2470,6 +2480,10 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep->skip = true; xhci_dbg(xhci, "Miss service interval error, set skip flag\n"); goto cleanup; + case COMP_PING_ERR: + ep->skip = true; + xhci_dbg(xhci, "No Ping response error, Skip one Isoc TD\n"); + goto cleanup; default: if (xhci_is_vendor_info_code(xhci, trb_comp_code)) { status = 0; @@ -2601,13 +2615,18 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep, &status); cleanup: + + + handling_skipped_tds = ep->skip && + trb_comp_code != COMP_MISSED_INT && + trb_comp_code != COMP_PING_ERR; + /* - * Do not update event ring dequeue pointer if ep->skip is set. - * Will roll back to continue process missed tds. + * Do not update event ring dequeue pointer if we're in a loop + * processing missed tds. */ - if (trb_comp_code == COMP_MISSED_INT || !ep->skip) { + if (!handling_skipped_tds) inc_deq(xhci, xhci->event_ring); - } if (ret) { urb = td->urb; @@ -2642,7 +2661,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, * Process them as short transfer until reach the td pointed by * the event. */ - } while (ep->skip && trb_comp_code != COMP_MISSED_INT); + } while (handling_skipped_tds); return 0; } @@ -3493,8 +3512,8 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags, if (start_cycle == 0) field |= 0x1; - /* xHCI 1.0 6.4.1.2.1: Transfer Type field */ - if (xhci->hci_version == 0x100) { + /* xHCI 1.0/1.1 6.4.1.2.1: Transfer Type field */ + if (xhci->hci_version >= 0x100) { if (urb->transfer_buffer_length > 0) { if (setup->bRequestType & USB_DIR_IN) field |= TRB_TX_TYPE(TRB_DATA_IN); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index c918fd6aa..eca811ef9 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -141,7 +141,8 @@ static int xhci_start(struct xhci_hcd *xhci) "waited %u microseconds.\n", XHCI_MAX_HALT_USEC); if (!ret) - xhci->xhc_state &= ~XHCI_STATE_HALTED; + xhci->xhc_state &= ~(XHCI_STATE_HALTED | XHCI_STATE_DYING); + return ret; } diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 1e4899c2d..4038789d6 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -629,6 +629,10 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_NT_ORIONLXM_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, { USB_DEVICE(FTDI_VID, FTDI_SYNAPSE_SS200_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2WI_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX3_PID) }, /* * ELV devices: */ diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 1fee973f1..70b24c02b 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -568,6 +568,14 @@ */ #define FTDI_SYNAPSE_SS200_PID 0x9090 /* SS200 - SNAP Stick 200 */ +/* + * CustomWare / ShipModul NMEA multiplexers product ids (FTDI_VID) + */ +#define FTDI_CUSTOMWARE_MINIPLEX_PID 0xfd48 /* MiniPlex first generation NMEA Multiplexer */ +#define FTDI_CUSTOMWARE_MINIPLEX2_PID 0xfd49 /* MiniPlex-USB and MiniPlex-2 series */ +#define FTDI_CUSTOMWARE_MINIPLEX2WI_PID 0xfd4a /* MiniPlex-2Wi */ +#define FTDI_CUSTOMWARE_MINIPLEX3_PID 0xfd4b /* MiniPlex-3 series */ + /********************************/ /** third-party VID/PID combos **/ diff --git a/drivers/video/msm/mdss/mdss_dsi.c b/drivers/video/msm/mdss/mdss_dsi.c index 3163ebd9a..a15204ef3 100644 --- a/drivers/video/msm/mdss/mdss_dsi.c +++ b/drivers/video/msm/mdss/mdss_dsi.c @@ -23,6 +23,10 @@ #include #include +#ifdef CONFIG_STATE_NOTIFIER +#include +#endif + #include "mdss.h" #include "mdss_fb.h" #include "mdss_panel.h" @@ -1474,6 +1478,10 @@ static int mdss_dsi_event_handler(struct mdss_panel_data *pdata, ctrl_pdata->mdp_tg_on = 1; if (ctrl_pdata->on_cmds.link_state == DSI_HS_MODE) rc = mdss_dsi_unblank(pdata); +#ifdef CONFIG_STATE_NOTIFIER + if (!use_fb_notifier) + state_resume(); +#endif break; case MDSS_EVENT_BLANK: if (ctrl_pdata->off_cmds.link_state == DSI_HS_MODE) @@ -1484,6 +1492,10 @@ static int mdss_dsi_event_handler(struct mdss_panel_data *pdata, if (ctrl_pdata->off_cmds.link_state == DSI_LP_MODE) rc = mdss_dsi_blank(pdata); rc = mdss_dsi_off(pdata); +#ifdef CONFIG_STATE_NOTIFIER + if (!use_fb_notifier) + state_suspend(); +#endif break; #if defined(CONFIG_FB_MSM_MDSS_S6E8AA0A_HD_PANEL) case MTP_READ: diff --git a/drivers/video/msm/mdss/mdss_mdp_pp.c b/drivers/video/msm/mdss/mdss_mdp_pp.c index 94ab630a2..def3f7de9 100644 --- a/drivers/video/msm/mdss/mdss_mdp_pp.c +++ b/drivers/video/msm/mdss/mdss_mdp_pp.c @@ -4373,7 +4373,7 @@ int mdss_mdp_ad_input(struct msm_fb_data_type *mfd, mutex_lock(&ad->lock); if ((!PP_AD_STATE_IS_INITCFG(ad->state) && !PP_AD_STS_IS_DIRTY(ad->sts)) && - !input->mode == MDSS_AD_MODE_CALIB) { + input->mode != MDSS_AD_MODE_CALIB) { pr_warn("AD not initialized or configured."); ret = -EPERM; goto error; diff --git a/drivers/video/msm/mhl_v2/sii8240/sii8240.c b/drivers/video/msm/mhl_v2/sii8240/sii8240.c index 9c8c43769..9ad69723f 100755 --- a/drivers/video/msm/mhl_v2/sii8240/sii8240.c +++ b/drivers/video/msm/mhl_v2/sii8240/sii8240.c @@ -444,8 +444,7 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) return ret; } - switch (tmds_on) { - case true: + if (tmds_on) { #ifdef SFEATURE_HDCP_SUPPORT ret = mhl_read_byte_reg(tpi, 0x1A, &value); if (TMDS_OUTPUT_CONTROL_POWER_DOWN & value) { @@ -497,9 +496,8 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) if (unlikely(ret < 0)) pr_err("[ERROR] %s() send AVIF fail\n", __func__); #endif - break; + } else { - case false: #ifdef SFEATURE_HDCP_SUPPORT sii8240_hdcp_on(sii8240, false); #endif @@ -520,11 +518,6 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) __func__, __LINE__); return ret; } - break; - - default: - pr_err("[ERROR] %s() unknown value\n", __func__); - break; } return ret; } diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 89f264c67..de9273ea7 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -235,7 +235,7 @@ static int __init no_cleancache(char *s) } __setup("nocleancache", no_cleancache); -static struct cleancache_ops __initdata tmem_cleancache_ops = { +static const struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, @@ -389,14 +389,26 @@ static int __init xen_tmem_init(void) #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { - char *s = ""; - struct cleancache_ops old_ops = - cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) - s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); + if (tmem_enabled && cleancache) { + int err; + + err = cleancache_register_ops(&tmem_cleancache_ops); + if (err) + pr_warn("xen-tmem: failed to enable cleancache: %d\n", + err); + else + pr_info("cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. + */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; } #endif return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9e5132582..575c1902d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3685,7 +3685,8 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ - btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (!special_file(inode->i_mode)) + btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f4fa5cf0c..e5eacd9dd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -383,8 +383,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); - if (opt->name) - seq_printf(m, ",name=%s", opt->name); + if (opt->name) { + seq_puts(m, ",name="); + seq_escape(m, opt->name, ", \t\n\\"); + } if (opt->key) seq_puts(m, ",secret="); @@ -429,7 +431,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 6dd3b61ea..8431216ee 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -388,6 +388,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) return 0; } +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find the server given timestamp + * as part of ntlmv2 authentication (or local current time as + * default in case of failure) + */ +static __le64 +find_timestamp(struct cifs_ses *ses) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_TIMESTAMP) { + if (attrsize == sizeof(u64)) + return *((__le64 *)blobptr); + } + blobptr += attrsize; /* advance attr value */ + } + + return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); +} + static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, const struct nls_table *nls_cp) { @@ -549,6 +591,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) struct ntlmv2_resp *buf; char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ + __le64 rsp_timestamp; if (ses->server->secType == RawNTLMSSP) { if (!ses->domainName) { @@ -566,6 +609,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } } + /* Must be within 5 minutes of the server (or in range +/-2h + * in case of Mac OS X), so simply carry over server timestamp + * (as Windows 7 does) + */ + rsp_timestamp = find_timestamp(ses); + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); tilen = ses->auth_key.len; tiblob = ses->auth_key.response; @@ -583,7 +632,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); buf->blob_signature = cpu_to_le32(0x00000101); buf->reserved = 0; - buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + buf->time = rsp_timestamp; get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); buf->reserved2 = 0; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c0f65e848..5b730ba78 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -373,10 +373,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 534c1d46e..eba8f1d4a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -55,26 +55,26 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); - if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) - goto out; - if (nd) { - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; - } - rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); - if (nd) { - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; + if (lower_dentry->d_op && lower_dentry->d_op->d_revalidate) { + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + } + rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + } } if (dentry->d_inode) { - struct inode *lower_inode = - ecryptfs_inode_to_lower(dentry->d_inode); + struct inode *inode = dentry->d_inode; - fsstack_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode)); + if (!inode->i_nlink) + return 0; } -out: return rc; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f408762a..951ff257f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1677,10 +1677,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6172fa77a..4db9a9a31 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1298,11 +1298,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_printf(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_printf(s, ",spectator"); if (args->ar_localflocks) diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cdb41a1f6..8daea16ef 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -287,7 +287,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -397,11 +396,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 92fb358ce..db240c54a 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -132,13 +132,16 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -167,9 +170,6 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7b4c537d6..be0e218a3 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -138,9 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 1c42cc5b8..a1e91092f 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -566,13 +565,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 06fa56186..38e41d07d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -211,9 +211,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); if (sbi->part >= 0) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07c516bfe..fe63b15f5 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -264,7 +264,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); return 0; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 30dd7b10b..bdb86a8a8 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,6 +8,17 @@ #include #include "hpfs_fn.h" +static void hpfs_update_directory_times(struct inode *dir) +{ + time_t t = get_seconds(); + if (t == dir->i_mtime.tv_sec && + t == dir->i_ctime.tv_sec) + return; + dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; + dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + hpfs_write_inode_nolock(dir); +} + static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) result->i_mode = mode | S_IFDIR; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, s result->i_mode = mode | S_IFREG; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -262,6 +275,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); brelse(bh); hpfs_unlock(dir->i_sb); @@ -340,6 +354,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -423,6 +438,8 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -477,6 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -595,7 +614,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end1; } - end: +end: hpfs_i(i)->i_parent_dir = new_dir->i_ino; if (S_ISDIR(i->i_mode)) { inc_nlink(new_dir); @@ -610,6 +629,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(bh); } end1: + if (!err) { + hpfs_update_directory_times(old_dir); + hpfs_update_directory_times(new_dir); + } hpfs_unlock(i->i_sb); return err; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3d344ab0b..92eff4da0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1851,7 +1851,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - if (opendata->o_arg.open_flags & O_EXCL) { + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) { nfs4_exclusive_attrset(opendata, sattr); nfs_fattr_init(opendata->o_res.f_attr); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 7ba6ac187..8e48ba5f6 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1411,6 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, int found, ret; int set_maybe; int dispatch_assert = 0; + int dispatched = 0; if (!dlm_grab(dlm)) return DLM_MASTER_RESP_NO; @@ -1617,13 +1618,16 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); + } else { + dispatched = 1; } } else { if (res) dlm_lockres_put(res); } - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return response; } @@ -2041,7 +2045,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, /* queue up work for dlm_assert_master_worker */ - dlm_grab(dlm); /* get an extra ref for the work item */ dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); item->u.am.lockres = res; /* already have a ref */ /* can optionally ignore node numbers higher than this node */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d15b0714e..0e5013ed7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1689,6 +1689,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; + int dispatched = 0; if (!dlm_grab(dlm)) { /* since the domain has gone away on this @@ -1710,6 +1711,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); + } else { + dispatched = 1; } } else /* put.. incase we are not the master */ dlm_lockres_put(res); @@ -1717,7 +1720,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, } spin_unlock(&dlm->spinlock); - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return master; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 68f4541c2..7e478160b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1578,8 +1578,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) @@ -2357,7 +2357,8 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + + cleancache_init_shared_fs(sb); bail: return status; diff --git a/fs/pipe.c b/fs/pipe.c index abfb93525..6049235e2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -390,6 +390,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, void *addr; size_t chars = buf->len, remaining; int error, atomic; + int offset; if (chars > total_len) chars = total_len; @@ -403,9 +404,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, atomic = !iov_fault_in_pages_write(iov, chars); remaining = chars; + offset = buf->offset; redo: addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr, &buf->offset, + error = pipe_iov_copy_to_user(iov, addr, &offset, &remaining, atomic); ops->unmap(pipe, buf, addr); if (unlikely(error)) { @@ -421,6 +423,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } ret += chars; + buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 33f9e9b81..e1bd03dda 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages()- i.bufferram; if (cached < 0) cached = 0; @@ -112,7 +112,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 85a854e38..0abccc488 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -170,7 +170,7 @@ static void init_header(struct ctl_table_header *head, if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); node->header = head; } } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 8169be93a..e12357bb3 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -645,18 +645,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/splice.c b/fs/splice.c index 67c5210e7..286417764 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1165,7 +1165,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1208,6 +1208,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1220,6 +1221,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, read_len = ret; sd->total_len = read_len; + /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we diff --git a/fs/super.c b/fs/super.c index 672ccae0f..88b352193 100644 --- a/fs/super.c +++ b/fs/super.c @@ -167,7 +167,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.shrink = prune_super; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dab9a5f6d..d6c787dc2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -523,9 +523,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", diff --git a/ik.ramdisk/common/default.prop b/ik.ramdisk/common/default.prop index 077870132..8200bd439 100644 --- a/ik.ramdisk/common/default.prop +++ b/ik.ramdisk/common/default.prop @@ -5,7 +5,7 @@ persist.cne.feature=0 persist.security.ams.enforcing=0 ro.secure=0 ro.allow.mock.location=1 -ro.debuggable=1 +ro.debuggable=0 ro.adb.secure=0 ro.zygote=zygote32 persist.sys.strict_op_enable=false diff --git a/ik.ramdisk/common/fstab.qcom b/ik.ramdisk/common/fstab.qcom index 03e280b7f..d373b4236 100644 --- a/ik.ramdisk/common/fstab.qcom +++ b/ik.ramdisk/common/fstab.qcom @@ -3,10 +3,10 @@ # specify MF_CHECK, and must come before any filesystems that do specify MF_CHECK # -/dev/block/platform/msm_sdcc.1/by-name/system /system f2fs ro wait /dev/block/platform/msm_sdcc.1/by-name/system /system ext4 ro,errors=panic wait -/dev/block/platform/msm_sdcc.1/by-name/userdata /data f2fs nosuid,nodev,noatime,nodiratime,discard,inline_xattr wait,encryptable=footer +/dev/block/platform/msm_sdcc.1/by-name/system /system f2fs ro wait /dev/block/platform/msm_sdcc.1/by-name/userdata /data ext4 nosuid,nodev,noatime,noauto_da_alloc,discard,journal_async_commit,errors=panic wait,check,encryptable=footer +/dev/block/platform/msm_sdcc.1/by-name/userdata /data f2fs nosuid,nodev,noatime,nodiratime,discard,inline_xattr wait,encryptable=footer # VOLD /devices/msm_sdcc.[.23]/mmc_host/mmc* auto vfat default voldmanaged=extSdCard:auto,noemulatedsd /devices/platform/xhci-hcd/usb*sda auto vfat default voldmanaged=UsbDriveA:auto diff --git a/ik.ramdisk/common/init.idlekernel.sh b/ik.ramdisk/common/init.idlekernel.sh index 44022c976..691d2dd02 100755 --- a/ik.ramdisk/common/init.idlekernel.sh +++ b/ik.ramdisk/common/init.idlekernel.sh @@ -118,6 +118,16 @@ if [ ! -f $CFILE ]; then echo $CDEF > $CFILE fi echo `cat $CFILE` > $SFILE +# +# uksm +# +CFILE=$DDIR/uksm_run +SFILE=/sys/kernel/mm/uksm/run +CDEF=0 +if [ ! -f $CFILE ]; then + echo $CDEF > $CFILE +fi +echo `cat $CFILE` > $SFILE /sbin/setonboot apply & /system/xbin/busybox run-parts /system/etc/init.d & @@ -125,3 +135,36 @@ echo `cat $CFILE` > $SFILE stop thermal-engine sleep 2 start thermal-engine + +( + sleep 60; + + BB=/system/xbin/busybox + + # stop google service and restart it on boot. this remove high cpu load and ram leak! + if [ "$($BB pidof com.google.android.gms | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms)"; + fi; + if [ "$($BB pidof com.google.android.gms.unstable | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.unstable)"; + fi; + if [ "$($BB pidof com.google.android.gms.persistent | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.persistent)"; + fi; + if [ "$($BB pidof com.google.android.gms.wearable | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.wearable)"; + fi; + + # Google Services battery drain fixer by Alcolawl@xda + # http://forum.xda-developers.com/google-nexus-5/general/script-google-play-services-battery-t3059585/post59563859 + pm enable com.google.android.gms/.update.SystemUpdateActivity + pm enable com.google.android.gms/.update.SystemUpdateService + pm enable com.google.android.gms/.update.SystemUpdateService$ActiveReceiver + pm enable com.google.android.gms/.update.SystemUpdateService$Receiver + pm enable com.google.android.gms/.update.SystemUpdateService$SecretCodeReceiver + pm enable com.google.android.gsf/.update.SystemUpdateActivity + pm enable com.google.android.gsf/.update.SystemUpdatePanoActivity + pm enable com.google.android.gsf/.update.SystemUpdateService + pm enable com.google.android.gsf/.update.SystemUpdateService$Receiver + pm enable com.google.android.gsf/.update.SystemUpdateService$SecretCodeReceiver +)& diff --git a/ik.ramdisk/common/init.qcom.rc b/ik.ramdisk/common/init.qcom.rc index 412392f73..aee4cfc3a 100755 --- a/ik.ramdisk/common/init.qcom.rc +++ b/ik.ramdisk/common/init.qcom.rc @@ -27,6 +27,7 @@ import init.qcom.usb.rc import init.target.rc +#import init.carrier.rc on early-init mount debugfs debugfs /sys/kernel/debug diff --git a/include/linux/async.h b/include/linux/async.h index 68a953019..364e7ff16 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -9,19 +9,46 @@ * as published by the Free Software Foundation; version 2 * of the License. */ +#ifndef __ASYNC_H__ +#define __ASYNC_H__ #include #include typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); +struct async_domain { + struct list_head node; + struct list_head domain; + int count; + unsigned registered:1; +}; + +/* + * domain participates in global async_synchronize_full + */ +#define ASYNC_DOMAIN(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 1 } + +/* + * domain is free to go out of scope as soon as all pending work is + * complete, this domain does not participate in async_synchronize_full + */ +#define ASYNC_DOMAIN_EXCLUSIVE(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 0 } extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *list); + struct async_domain *domain); extern void async_synchronize_full(void); -extern void async_synchronize_full_domain(struct list_head *list); +extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *list); - + struct async_domain *domain); +#endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6619eb255..f134c8334 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -431,6 +431,7 @@ struct request_queue { #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_SANITIZE 19 /* supports SANITIZE */ +#define QUEUE_FLAG_FAST 20 /* fast block device (e.g. ram based) */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -513,6 +514,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_sanitize(q) test_bit(QUEUE_FLAG_SANITIZE, &(q)->queue_flags) #define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_fast(q) test_bit(QUEUE_FLAG_FAST, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 42e55deee..cb3e14234 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,10 @@ #include #include +#define CLEANCACHE_NO_POOL -1 +#define CLEANCACHE_NO_BACKEND -2 +#define CLEANCACHE_NO_BACKEND_SHARED -3 + #define CLEANCACHE_KEY_MAX 6 /* @@ -33,18 +37,17 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); extern void __cleancache_invalidate_inode(struct address_space *); extern void __cleancache_invalidate_fs(struct super_block *); -extern int cleancache_enabled; #ifdef CONFIG_CLEANCACHE +#define cleancache_enabled (1) static inline bool cleancache_fs_enabled(struct page *page) { return page->mapping->host->i_sb->cleancache_poolid >= 0; @@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb) __cleancache_init_fs(sb); } -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); + __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 923d093c9..7f5c2a39e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -164,6 +164,110 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); (typeof(ptr)) (__ptr + (off)); }) #endif +#include + +#define __READ_ONCE_SIZE \ +({ \ + switch (size) { \ + case 1: *(__u8 *)res = *(volatile __u8 *)p; break; \ + case 2: *(__u16 *)res = *(volatile __u16 *)p; break; \ + case 4: *(__u32 *)res = *(volatile __u32 *)p; break; \ + case 8: *(__u64 *)res = *(volatile __u64 *)p; break; \ + default: \ + barrier(); \ + __builtin_memcpy((void *)res, (const void *)p, size); \ + barrier(); \ + } \ +}) + +static __always_inline +void __read_once_size(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} + +#ifdef CONFIG_KASAN +/* + * This function is not 'inline' because __no_sanitize_address confilcts + * with inlining. Attempt to inline it may cause a build failure. + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 + * '__maybe_unused' allows us to avoid defined-but-not-used warnings. + */ +static __no_sanitize_address __maybe_unused +void __read_once_size_nocheck(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} +#else +static __always_inline +void __read_once_size_nocheck(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} +#endif + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8 *)p = *(__u8 *)res; break; + case 2: *(volatile __u16 *)p = *(__u16 *)res; break; + case 4: *(volatile __u32 *)p = *(__u32 *)res; break; + case 8: *(volatile __u64 *)p = *(__u64 *)res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the + * compiler is aware of some particular ordering. One way to make the + * compiler aware of ordering is to put the two invocations of READ_ONCE, + * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * + * In contrast to ACCESS_ONCE these two macros will also work on aggregate + * data types like structs or unions. If the size of the accessed data + * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at + * least two memcpy()s: one for the __builtin_memcpy() and then one for + * the macro doing the copy of variable - '__u' allocated on the stack. + * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ + +#define __READ_ONCE(x, check) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u; \ + if (check) \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + else \ + __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) +#define READ_ONCE(x) __READ_ONCE(x, 1) + +/* + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need + * to hide memory access from KASAN. + */ +#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (__force typeof(x)) (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 92f9fcdc5..233c27277 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -126,21 +126,33 @@ enum { #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ #ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); +extern void __unregister_cpu_notifier(struct notifier_block *nb); #else #ifndef MODULE extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); #else static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } + +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} #endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { } + +static inline void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} #endif int cpu_up(unsigned int cpu); @@ -148,6 +160,9 @@ void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); +#define cpu_notifier_register_begin cpu_maps_update_begin +#define cpu_notifier_register_done cpu_maps_update_done + #else /* CONFIG_SMP */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3a4cef532..10ca9e2a7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -83,6 +83,22 @@ enum fid_type { * 64 bit parent inode number. */ FILEID_NILFS_WITH_PARENT = 0x62, + + /* + * 32 bit generation number, 40 bit i_pos. + */ + FILEID_FAT_WITHOUT_PARENT = 0x71, + + /* + * 32 bit generation number, 40 bit i_pos, + * 32 bit parent generation number, 40 bit parent i_pos + */ + FILEID_FAT_WITH_PARENT = 0x72, + + /* + * Filesystems must not use 0xff file ID. + */ + FILEID_INVALID = 0xff, }; struct fid { diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b11e298e9..2b9894067 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/list.h b/include/linux/list.h index 1712c7e1f..8a8acdcf3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -361,6 +361,28 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +/** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + /** * list_next_entry - get the next element in list * @pos: the type * to cursor @@ -432,9 +454,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. @@ -443,9 +465,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) + for (pos = list_last_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() @@ -468,9 +490,9 @@ static inline void list_splice_tail_init(struct list_head *list, * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (pos = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point @@ -482,9 +504,9 @@ static inline void list_splice_tail_init(struct list_head *list, * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ - for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) + for (pos = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point @@ -495,8 +517,8 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry @@ -506,10 +528,10 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal @@ -522,10 +544,10 @@ static inline void list_splice_tail_init(struct list_head *list, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ + for (pos = list_next_entry(pos, member), \ + n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal @@ -538,9 +560,9 @@ static inline void list_splice_tail_init(struct list_head *list, * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ - for (n = list_entry(pos->member.next, typeof(*pos), member); \ + for (n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal @@ -553,10 +575,10 @@ static inline void list_splice_tail_init(struct list_head *list, * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member), \ - n = list_entry(pos->member.prev, typeof(*pos), member); \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop @@ -571,7 +593,7 @@ static inline void list_splice_tail_init(struct list_head *list, * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ - n = list_entry(pos->member.next, typeof(*pos), member) + n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000..6b784c59f --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,87 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define LZ4_MEM_COMPRESS (16384) +#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + + /* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, whcih is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); +#endif diff --git a/include/linux/mm.h b/include/linux/mm.h index f5ca7ecfc..4e7889f1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -389,7 +389,7 @@ static inline struct page *compound_head(struct page *page) * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ -static inline void reset_page_mapcount(struct page *page) +static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } @@ -810,18 +810,7 @@ void page_address_init(void); #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) -extern struct address_space swapper_space; -static inline struct address_space *page_mapping(struct page *page) -{ - struct address_space *mapping = page->mapping; - - VM_BUG_ON(PageSlab(page)); - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; -} +extern struct address_space *page_mapping(struct page *page); /* Neutral page->mapping pointer to address_space or anon_vma or other */ static inline void *page_rmapping(struct page *page) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2d0243350..1bd8e64f3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -115,6 +115,9 @@ enum pageflags { #endif #ifdef CONFIG_SDP PG_sensitive, +#endif +#ifdef CONFIG_ZCACHE + PG_was_active, #endif __NR_PAGEFLAGS, #if defined(CONFIG_CMA_PAGE_COUNTING) @@ -221,6 +224,12 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) +#ifdef CONFIG_ZCACHE +PAGEFLAG(WasActive, was_active) +#else +PAGEFLAG_FALSE(WasActive) +#endif + /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 696ca3920..530b807fc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -236,6 +236,11 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) typedef int filler_t(void *, struct page *); +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); + extern struct page * find_get_page(struct address_space *mapping, pgoff_t index); extern struct page * find_lock_page(struct address_space *mapping, diff --git a/include/linux/pci.h b/include/linux/pci.h index 469c9536c..579baf06f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -176,6 +176,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, /* Provide indication device is assigned by a Virtual Machine Manager */ PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, + /* Get VPD from function 0 VPD */ + PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), }; enum pci_irq_reroute_variant { diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38..74aee022b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -101,6 +128,7 @@ do { \ * concurrently with other readers. * * The notable exceptions to this rule are the following functions: + * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get @@ -216,21 +244,29 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) rcu_assign_pointer(*pslot, item); } +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node); +void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +unsigned int +radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results, + unsigned long *indices, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -321,13 +357,29 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags); +/** + * radix_tree_iter_retry - retry this chunk of the iteration + * @iter: iterator state + * + * If we iterate over a tree protected only by the RCU lock, a race + * against deletion or creation may result in seeing a slot for which + * radix_tree_deref_retry() returns true. If so, call this function + * and continue the iteration. + */ +static inline __must_check +void **radix_tree_iter_retry(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * * @iter: pointer to radix tree iterator * Returns: current chunk size */ -static __always_inline unsigned +static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; @@ -361,9 +413,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return slot + offset + 1; } } else { - unsigned size = radix_tree_chunk_size(iter) - 1; + long size = radix_tree_chunk_size(iter); - while (size--) { + while (--size > 0) { slot++; iter->index++; if (likely(*slot)) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 033b507b3..a5aa7ae67 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -23,72 +23,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - Some example of insert and search follows here. The search is a plain - normal search over an ordered tree. The insert instead must be implemented - in two steps: First, the code must insert the element in order as a red leaf - in the tree, and then the support library function rb_insert_color() must - be called. Such function will do the not trivial work to rebalance the - rbtree, if necessary. - ------------------------------------------------------------------------ -static inline struct page * rb_search_page_cache(struct inode * inode, - unsigned long offset) -{ - struct rb_node * n = inode->i_rb_page_cache.rb_node; - struct page * page; - - while (n) - { - page = rb_entry(n, struct page, rb_page_cache); - - if (offset < page->offset) - n = n->rb_left; - else if (offset > page->offset) - n = n->rb_right; - else - return page; - } - return NULL; -} - -static inline struct page * __rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct rb_node ** p = &inode->i_rb_page_cache.rb_node; - struct rb_node * parent = NULL; - struct page * page; - - while (*p) - { - parent = *p; - page = rb_entry(parent, struct page, rb_page_cache); - - if (offset < page->offset) - p = &(*p)->rb_left; - else if (offset > page->offset) - p = &(*p)->rb_right; - else - return page; - } - - rb_link_node(node, parent, p); - - return NULL; -} - -static inline struct page * rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct page * ret; - if ((ret = __rb_insert_page_cache(inode, offset, node))) - goto out; - rb_insert_color(node, &inode->i_rb_page_cache); - out: - return ret; -} ------------------------------------------------------------------------ + See Documentation/rbtree.txt for documentation and samples. */ #ifndef _LINUX_RBTREE_H @@ -96,64 +31,37 @@ static inline struct page * rb_insert_page_cache(struct inode * inode, #include #include +#include -struct rb_node -{ - unsigned long rb_parent_color; -#define RB_RED 0 -#define RB_BLACK 1 +struct rb_node { + unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ -struct rb_root -{ +struct rb_root { struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -#define rb_color(r) ((r)->rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) - -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -} -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -} +#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ +#define RB_EMPTY_NODE(node) \ + ((node)->__rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) \ + ((node)->__rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -typedef void (*rb_augment_f)(struct rb_node *node, void *data); - -extern void rb_augment_insert(struct rb_node *node, - rb_augment_f func, void *data); -extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); -extern void rb_augment_erase_end(struct rb_node *node, - rb_augment_f func, void *data); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -161,17 +69,58 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ -extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, - struct rb_node ** rb_link) +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) { - node->rb_parent_color = (unsigned long )parent; + node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } +static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + node->__rb_parent_color = (unsigned long)parent; + node->rb_left = node->rb_right = NULL; + + rcu_assign_pointer(*rb_link, node); +} + +#define rb_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? rb_entry(____ptr, type, member) : NULL; \ + }) + +/** + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. + */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ + pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ + typeof(*pos), field); 1; }); \ + pos = n) + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000..14d7b831b --- /dev/null +++ b/include/linux/rbtree_augmented.h @@ -0,0 +1,249 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree_augmented.h +*/ + +#ifndef _LINUX_RBTREE_AUGMENTED_H +#define _LINUX_RBTREE_AUGMENTED_H + +#include +#include + +/* + * Please note - only struct rb_augment_callbacks and the prototypes for + * rb_insert_augmented() and rb_erase_augmented() are intended to be public. + * The rest are implementation details you are not expected to depend on. + * + * See Documentation/rbtree.txt for documentation and samples. + */ + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +/* + * Fixup the rbtree and update the augmented information when rebalancing. + * + * On insertion, the user must update the augmented information on the path + * leading to the inserted node, then call rb_link_node() as usual and + * rb_augment_inserted() instead of the usual rb_insert_color() call. + * If rb_augment_inserted() rebalances the rbtree, it will callback into + * a user provided function to update the augmented information on the + * affected subtrees. + */ +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static inline void \ +rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static inline void \ +rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void \ +rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + + +#define RB_RED 0 +#define RB_BLACK 1 + +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} + +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + WRITE_ONCE(parent->rb_left, new); + else + WRITE_ONCE(parent->rb_right, new); + } else + WRITE_ONCE(root->rb_node, new); +} + +extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + +static __always_inline struct rb_node * +__rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *child = node->rb_right; + struct rb_node *tmp = node->rb_left; + struct rb_node *parent, *rebalance; + unsigned long pc; + + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ + pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, child, parent, root); + if (child) { + child->__rb_parent_color = pc; + rebalance = NULL; + } else + rebalance = __rb_is_black(pc) ? parent : NULL; + tmp = parent; + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, tmp, parent, root); + rebalance = NULL; + tmp = parent; + } else { + struct rb_node *successor = child, *child2; + + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = successor; + child2 = successor->rb_right; + + augment->copy(node, successor); + } else { + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + child2 = successor->rb_right; + WRITE_ONCE(parent->rb_left, child2); + WRITE_ONCE(successor->rb_right, child); + rb_set_parent(child, successor); + + augment->copy(node, successor); + augment->propagate(parent, successor); + } + + tmp = node->rb_left; + WRITE_ONCE(successor->rb_left, tmp); + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); + rebalance = NULL; + } else { + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? parent : NULL; + } + tmp = successor; + } + + augment->propagate(tmp, NULL); + return rebalance; +} + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); +} + +#endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index bf4b7ba11..55eac4d5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -338,19 +338,6 @@ static inline void lockup_detector_init(void) } #endif -#ifdef CONFIG_DETECT_HUNG_TASK -extern unsigned int sysctl_hung_task_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#else -/* Avoid need for ifdefs elsewhere in the code */ -enum { sysctl_hung_task_timeout_secs = 0 }; -#endif - /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -372,23 +359,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - #include #ifdef CONFIG_MMU @@ -1258,12 +1228,6 @@ struct sched_rt_entity { #endif }; -/* - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define RR_TIMESLICE (100 * HZ / 1000) - struct rcu_node; enum perf_event_task_context { @@ -2083,52 +2047,10 @@ extern void wake_up_idle_cpu(int cpu); static inline void wake_up_idle_cpu(int cpu) { } #endif -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_wake_to_idle; - -enum sched_tunable_scaling { - SCHED_TUNABLESCALING_NONE, - SCHED_TUNABLESCALING_LOG, - SCHED_TUNABLESCALING_LINEAR, - SCHED_TUNABLESCALING_END, -}; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; - -#ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; -extern unsigned int sysctl_sched_shares_window; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); -#endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +extern unsigned int sysctl_sched_ravg_window; +extern unsigned int sysctl_sched_wakeup_load_threshold; #ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; - extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); @@ -2144,10 +2066,6 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h new file mode 100644 index 000000000..d8c6e8606 --- /dev/null +++ b/include/linux/sched/sysctl.h @@ -0,0 +1,91 @@ +#ifndef _SCHED_SYSCTL_H +#define _SCHED_SYSCTL_H + +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; +#endif + +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_wake_to_idle; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; +extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +#endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; +#endif + +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +#endif /* _SCHED_SYSCTL_H */ diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index e156ce155..6218a2dad 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -142,6 +142,41 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter, int seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num); +/** + * seq_show_options - display mount options with appropriate escapes. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, can be NULL + */ +static inline void seq_show_option(struct seq_file *m, const char *name, + const char *value) +{ + seq_putc(m, ','); + seq_escape(m, name, ",= \t\n\\"); + if (value) { + seq_putc(m, '='); + seq_escape(m, value, ", \t\n\\"); + } +} + +/** + * seq_show_option_n - display mount options with appropriate escapes + * where @value must be a specific length. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, cannot be NULL + * @length: the length of @value to display + * + * This is a macro since this uses "length" to define the size of the + * stack buffer. + */ +#define seq_show_option_n(m, name, value, length) { \ + char val_buf[length + 1]; \ + strncpy(val_buf, value, length); \ + val_buf[length] = '\0'; \ + seq_show_option(m, name, val_buf); \ +} + #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 5c25b9a8c..e4a26ea07 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,6 +4,12 @@ /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extention later. + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. + * + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is + * querying the cache size, so a fastpath for that case is appropriate. */ struct shrink_control { gfp_t gfp_mask; @@ -14,24 +20,38 @@ struct shrink_control { int priority; }; +#define SHRINK_STOP (~0UL) + /* * A callback you can register to apply pressure to ageable caches. * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * The callback must not return -1 if nr_to_scan is zero. + * @shrink() should look through the least-recently-used 'nr_to_scan' entries + * and attempt to free them up. It should return the number of objects which + * remain in the cache. If it returns -1, it means it cannot do any scanning at + * this time (eg. there is a risk of deadlock). * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. + * @count_objects should return the number of freeable items in the cache. If + * there are no objects to free or the number of freeable items cannot be + * determined, it should return 0. No deadlock checks should be done during the + * count callback - the shrinker relies on aggregating scan counts that couldn't + * be executed due to potential deadlocks to be run at a later call when the + * deadlock condition is no longer pending. * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. + * @scan_objects will only be called if @count_objects returned a non-zero + * value for the number of freeable objects. The callout should scan the cache + * and attempt to free items from the cache. It should then return the number + * of objects freed during the scan, or SHRINK_STOP if progress cannot be made + * due to potential deadlocks. If SHRINK_STOP is returned, then no further + * attempts to call the @scan_objects will be made from the current reclaim + * context. */ struct shrinker { int (*shrink)(struct shrinker *, struct shrink_control *sc); + unsigned long (*count_objects)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan_objects)(struct shrinker *, + struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ diff --git a/include/linux/state_notifier.h b/include/linux/state_notifier.h new file mode 100644 index 000000000..34f9287aa --- /dev/null +++ b/include/linux/state_notifier.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_STATE_NOTIFIER_H +#define __LINUX_STATE_NOTIFIER_H + +#include + +#define STATE_NOTIFIER_ACTIVE 0x01 +#define STATE_NOTIFIER_SUSPEND 0x02 + +struct state_event { + void *data; +}; + +extern void state_suspend(void); +extern void state_resume(void); +int state_register_client(struct notifier_block *nb); +int state_unregister_client(struct notifier_block *nb); +int state_notifier_call_chain(unsigned long val, void *v); + +#endif /* _LINUX_STATE_NOTIFIER_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 886dda7f7..453425ace 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -19,10 +20,13 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 -#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ +#define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ +#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ - SWAP_FLAG_DISCARD) + SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ + SWAP_FLAG_DISCARD_PAGES) static inline int current_is_kswapd(void) { @@ -146,16 +150,20 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */ + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ + SWP_FILE = (1 << 7), /* set after swap_activate success */ + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ /* add others here before... */ - SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_FAST = (1 << 10), /* blkdev access is fast and cheap */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; -#define SWAP_CLUSTER_MAX 32 +#define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* @@ -329,14 +337,16 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern int swap_set_page_dirty(struct page *page); extern void end_swap_bio_write(struct bio *bio, int err); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)); extern void end_swap_bio_read(struct bio *bio, int err); /* linux/mm/swap_state.c */ -extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages +extern struct address_space swapper_spaces[]; +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) +extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); @@ -354,10 +364,18 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern bool is_swap_fast(swp_entry_t entry); /* Swap 50% full? Release swapcache more aggressively.. */ -static inline bool vm_swap_full(void) +static inline bool vm_swap_full(struct swap_info_struct *si) { + /* + * If the swap device is fast, return true + * not to delay swap free. + */ + if (si->flags & SWP_FAST) + return true; + return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } @@ -401,8 +419,8 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define get_nr_swap_pages() 0L #define total_swap_pages 0L -#define total_swapcache_pages 0UL -#define vm_swap_full() 0 +#define total_swapcache_pages() 0UL +#define vm_swap_full(si) 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 508872747..a520fd70a 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/include/linux/unlz4.h b/include/linux/unlz4.h new file mode 100644 index 000000000..3273c2f36 --- /dev/null +++ b/include/linux/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *output, + long *pos, + void(*error)(char *x)); +#endif diff --git a/include/linux/zbud.h b/include/linux/zbud.h new file mode 100644 index 000000000..e183a0a65 --- /dev/null +++ b/include/linux/zbud.h @@ -0,0 +1,22 @@ +#ifndef _ZBUD_H_ +#define _ZBUD_H_ + +#include + +struct zbud_pool; + +struct zbud_ops { + int (*evict)(struct zbud_pool *pool, unsigned long handle); +}; + +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops); +void zbud_destroy_pool(struct zbud_pool *pool); +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle); +void zbud_free(struct zbud_pool *pool, unsigned long handle); +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); +void *zbud_map(struct zbud_pool *pool, unsigned long handle); +void zbud_unmap(struct zbud_pool *pool, unsigned long handle); +u64 zbud_get_pool_size(struct zbud_pool *pool); + +#endif /* _ZBUD_H_ */ diff --git a/include/linux/zcache.h b/include/linux/zcache.h new file mode 100644 index 000000000..2db7e4bbb --- /dev/null +++ b/include/linux/zcache.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _LINUX_ZCACHE_H +#define _LINUX_ZCACHE_H + +#ifdef CONFIG_ZCACHE +extern u64 zcache_pages(void); +#else +u64 zcache_pages(void) { return 0; } +#endif + +#endif /* _LINUX_ZCACHE_H */ diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 000000000..24e0173c7 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,110 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. + */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +bool zpool_has_pool(char *type); + +struct zpool *zpool_create_pool(const char *type, const char *name, + gfp_t gfp, const struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. + */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(const char *name, + gfp_t gfp, + const struct zpool_ops *ops, + struct zpool *zpool); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +#endif diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 398dae365..57a8e98f2 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. @@ -14,14 +15,13 @@ #define _ZS_MALLOC_H_ #include -#include /* * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. -*/ + * They also have no effect when PGTABLE_MAPPING is selected. + */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ ZS_MM_RO, /* read-only (no copy-out at unmap time) */ @@ -34,14 +34,14 @@ enum zs_mapmode { */ }; -struct zs_ops { - struct page * (*alloc)(gfp_t); - void (*free)(struct page *); +struct zs_pool_stats { + /* How many pages were migrated (freed) */ + unsigned long pages_compacted; }; struct zs_pool; -struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); @@ -51,6 +51,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); -u64 zs_get_total_size_bytes(struct zs_pool *pool); +unsigned long zs_get_total_pages(struct zs_pool *pool); +unsigned long zs_compact(struct zs_pool *pool); +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/include/sound/wm8904.h b/include/sound/wm8904.h index 898be3a8d..6d8f8fba3 100644 --- a/include/sound/wm8904.h +++ b/include/sound/wm8904.h @@ -119,7 +119,7 @@ #define WM8904_MIC_REGS 2 #define WM8904_GPIO_REGS 4 #define WM8904_DRC_REGS 4 -#define WM8904_EQ_REGS 25 +#define WM8904_EQ_REGS 24 /** * DRC configurations are specified with a label and a set of register diff --git a/kernel/async.c b/kernel/async.c index 32d8dc960..b7717733f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,7 +62,7 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); static DEFINE_SPINLOCK(async_lock); struct async_entry { @@ -71,7 +71,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,7 +82,7 @@ static atomic_t entry_count; /* * MUST be called with the lock held! */ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; @@ -93,9 +93,8 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; } - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -106,7 +105,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -126,10 +125,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -163,7 +163,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -249,14 +249,14 @@ EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -266,11 +266,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on, NULL indicates all lists * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. */ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad70cada2..f48e4e739 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1053,15 +1053,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + seq_show_option(seq, ss->name, NULL); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); if (clone_children(&root->top_cgroup)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); return 0; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 02caf610c..b2e02501b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -166,6 +166,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -189,6 +194,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -198,6 +204,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + static inline void check_for_tasks(int cpu) { struct task_struct *p; @@ -262,6 +274,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6a72395d3..72bc17919 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index fb655f5f9..15374d0ac 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internals.h" @@ -326,18 +327,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. + */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ @@ -358,6 +370,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) diff --git a/kernel/module.c b/kernel/module.c index 6282fdc4d..fac398bb5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -969,11 +969,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d443c36c..751d37d87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1983,11 +1983,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; finish_arch_switch(prev); @@ -4625,7 +4625,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); @@ -4678,7 +4677,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -4721,7 +4719,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -4734,12 +4731,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -7005,13 +7001,11 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) static void reinit_sched_domains(void) { - get_online_cpus(); /* Destroy domains first to force the rebuild */ partition_sched_domains(0, NULL, NULL); rebuild_sched_domains(); - put_online_cpus(); } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7162,14 +7156,18 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - get_online_cpus(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c65ccfcae..261c65543 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -712,8 +713,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); + smp_mb(); prev->on_cpu = 0; #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/kernel/sysctl.c b/kernel/sysctl.c index aa8debaa1..2d0dc1461 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c95833821..b3f540792 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -291,7 +291,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { + if ((abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } diff --git a/kernel/timer.c b/kernel/timer.c index 219865e85..5b5310630 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/lib/Kconfig b/lib/Kconfig index 09f0d8a64..4b23c9600 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -177,6 +177,15 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -201,6 +210,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index 9161b969d..1919769ec 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -72,6 +72,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -80,6 +83,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000..8085d04e9 --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new file mode 100644 index 000000000..28321d8f7 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. + * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL(lz4_compress); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000..6d940c72b --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,343 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include +#include +#endif +#include + +#include + +#include "lz4defs.h" + +static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 +static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + if (unlikely(length > (size_t)(length + len))) + goto _output_error; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + if (unlikely(length > (size_t)(length + *ip))) + goto _output_error; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + int dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return -1; +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + int dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return -1; +} + +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000..9b4182fd3 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,157 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define ARM_EFFICIENT_UNALIGNED_ACCESS +#define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) \ + do { \ + u8 *e = (d) + l; \ + LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000..f344f76b6 --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } +#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + return longest; +} + +static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +static int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int correction = + (int)(ip + ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < (int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL(lz4hc_compress); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3ac50dc55..6ebbdba6b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -32,35 +32,9 @@ #include #include #include +#include /* in_interrupt() */ -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int height; /* Height from the bottom */ - unsigned int count; - union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ - RADIX_TREE_MAP_SHIFT)) - /* * The height_to_maxindex array needs to be one deeper than the maximum * path as height 0 holds only 1 entry. @@ -72,12 +46,25 @@ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; */ static struct kmem_cache *radix_tree_node_cachep; +/* + * The radix tree is variable-height, so an insert operation not only has + * to build the branch to its corresponding item, it also has to build the + * branch to existing items if the size has to be increased (by + * radix_tree_extend). + * + * The worst case is a zero height tree with just a single item at index 0, + * and then inserting an item at index ULONG_MAX. This requires 2 new branches + * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. + * Hence: + */ +#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) + /* * Per-cpu pool of preloaded nodes */ struct radix_tree_preload { int nr; - struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; + struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE]; }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; @@ -194,7 +181,12 @@ radix_tree_node_alloc(struct radix_tree_root *root) struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - if (!(gfp_mask & __GFP_WAIT)) { + /* + * Preload code isn't irq safe and it doesn't make sence to use + * preloading in the interrupt anyway as all the allocations have to + * be atomic. So just do normal allocation when in interrupt. + */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -202,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; @@ -251,21 +243,21 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < ARRAY_SIZE(rtp->nodes)) rtp->nodes[rtp->nr++] = node; else @@ -275,8 +267,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -337,23 +360,28 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) } /** - * radix_tree_insert - insert into a radix tree + * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key - * @item: item to insert + * @nodep: returns node + * @slotp: returns slot * - * Insert an item into the radix tree at position @index. + * Create, if necessary, and return the node and slot for an item + * at position @index in the radix tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. + * + * Returns -ENOMEM, or 0 for success. */ -int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *item) +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift; - int offset; + unsigned int height, shift, offset; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); - /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); @@ -389,16 +417,42 @@ int radix_tree_insert(struct radix_tree_root *root, height--; } - if (slot != NULL) + if (nodep) + *nodep = node; + if (slotp) + *slotp = node ? node->slots + offset : (void **)&root->rnode; + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_indirect_ptr(item)); + + error = __radix_tree_create(root, index, &node, &slot); + if (error) + return error; + if (*slot != NULL) return -EEXIST; + rcu_assign_pointer(*slot, item); if (node) { node->count++; - rcu_assign_pointer(node->slots[offset], item); - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); + BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK)); + BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK)); } else { - rcu_assign_pointer(root->rnode, item); BUG_ON(root_tag_get(root, 0)); BUG_ON(root_tag_get(root, 1)); } @@ -407,15 +461,26 @@ int radix_tree_insert(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_insert); -/* - * is_slot == 1 : search for the slot. - * is_slot == 0 : search for the node. +/** + * __radix_tree_lookup - lookup an item in a radix tree + * @root: radix tree root + * @index: index key + * @nodep: returns node + * @slotp: returns slot + * + * Lookup and return the item at position @index in the radix + * tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. */ -static void *radix_tree_lookup_element(struct radix_tree_root *root, - unsigned long index, int is_slot) +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { + struct radix_tree_node *node, *parent; unsigned int height, shift; - struct radix_tree_node *node, **slot; + void **slot; node = rcu_dereference_raw(root->rnode); if (node == NULL) @@ -424,7 +489,12 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; - return is_slot ? (void *)&root->rnode : node; + + if (nodep) + *nodep = NULL; + if (slotp) + *slotp = (void **)&root->rnode; + return node; } node = indirect_to_ptr(node); @@ -435,8 +505,8 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, shift = (height-1) * RADIX_TREE_MAP_SHIFT; do { - slot = (struct radix_tree_node **) - (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + parent = node; + slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; @@ -445,7 +515,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot : indirect_to_ptr(node); + if (nodep) + *nodep = parent; + if (slotp) + *slotp = slot; + return node; } /** @@ -463,7 +537,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return (void **)radix_tree_lookup_element(root, index, 1); + void **slot; + + if (!__radix_tree_lookup(root, index, NULL, &slot)) + return NULL; + return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); @@ -481,7 +559,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { - return radix_tree_lookup_element(root, index, 0); + return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); @@ -896,103 +974,73 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - /** - * radix_tree_next_hole - find the next hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest - * indexed hole. + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= max_scan' - * will be true). In rare cases of index wrap-around, 0 will be returned. + * The implementation is naive. * - * radix_tree_next_hole may be called under rcu_read_lock. However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 5, then subsequently a hole is created at index 10, - * radix_tree_next_hole covering both indexes may return 10 if called - * under rcu_read_lock. + * Like radix_tree_lookup, radix_tree_gang_lookup may be called under + * rcu_read_lock. In this case, rather than the returned results being + * an atomic snapshot of the tree at a single point in time, the semantics + * of an RCU protected gang lookup are as though multiple radix_tree_lookups + * have been issued in individual locks, and results stored in 'results'. */ -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index++; - if (index == 0) - break; - } - - return index; -} -EXPORT_SYMBOL(radix_tree_next_hole); + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; -/** - * radix_tree_prev_hole - find the prev hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] - * for the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= max_scan' - * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. - * - * radix_tree_next_hole may be called under rcu_read_lock. However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 10, then subsequently a hole is created at index 5, - * radix_tree_prev_hole covering both indexes may return 5 if called under - * rcu_read_lock. - */ -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) -{ - unsigned long i; + if (unlikely(!max_items)) + return 0; - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index--; - if (index == ULONG_MAX) + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = rcu_dereference_raw(*slot); + if (!results[ret]) + continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (++ret == max_items) break; } - return index; + return ret; } -EXPORT_SYMBOL(radix_tree_prev_hole); +EXPORT_SYMBOL(radix_tree_gang_lookup); /** - * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * radix_tree_gang_lookup_index - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed + * @indices: where their indices should be placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at - * *@results. + * *@results. The indices are placed in @indices. * * The implementation is naive. * - * Like radix_tree_lookup, radix_tree_gang_lookup may be called under - * rcu_read_lock. In this case, rather than the returned results being - * an atomic snapshot of the tree at a single point in time, the semantics - * of an RCU protected gang lookup are as though multiple radix_tree_lookups - * have been issued in individual locks, and results stored in 'results'. + * Just one difference from radix_tree_gang_lookup, the indices are also + * collected along with the results of lookup. */ unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items) +radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results, + unsigned long *indices, unsigned long first_index, + unsigned int max_items) { struct radix_tree_iter iter; void **slot; @@ -1005,13 +1053,15 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); if (!results[ret]) continue; + if (indices) + indices[ret] = iter.index; if (++ret == max_items) break; } return ret; } -EXPORT_SYMBOL(radix_tree_gang_lookup); +EXPORT_SYMBOL(radix_tree_gang_lookup_index); /** * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree @@ -1081,9 +1131,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (++ret == max_items) break; } @@ -1203,8 +1257,10 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); - if (cur_index > max_index) + if (cur_index > max_index) { + rcu_read_unlock(); break; + } cur_index = __locate(node, item, cur_index, &found_index); rcu_read_unlock(); @@ -1285,48 +1341,89 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) } /** - * radix_tree_delete - delete an item from a radix tree + * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @index: index key + * @node: node containing @index * - * Remove the item at @index from the radix tree rooted at @root. + * After clearing the slot at @index in @node from radix tree + * rooted at @root, call this function to attempt freeing the + * node and shrinking the tree. * - * Returns the address of the deleted item, or NULL if it was not present. + * Returns %true if @node was freed, %false otherwise. */ -void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node) { - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot = NULL; - struct radix_tree_node *to_free; - unsigned int height, shift; + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == indirect_to_ptr(root->rnode)) { + radix_tree_shrink(root); + if (root->height == 0) + deleted = true; + } + return deleted; + } + + parent = node->parent; + if (parent) { + index >>= RADIX_TREE_MAP_SHIFT; + + parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->height = 0; + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + +/** + * radix_tree_delete_item - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * @item: expected item + * + * Remove @item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present + * or the entry at the given @index was not @item. + */ +void *radix_tree_delete_item(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node; + unsigned int offset; + void **slot; + void *entry; int tag; - int uninitialized_var(offset); - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; + entry = __radix_tree_lookup(root, index, &node, &slot); + if (!entry) + return NULL; - slot = root->rnode; - if (height == 0) { + if (item && entry != item) + return NULL; + + if (!node) { root_tag_clear_all(root); root->rnode = NULL; - goto out; + return entry; } - slot = indirect_to_ptr(slot); - shift = height * RADIX_TREE_MAP_SHIFT; - do { - if (slot == NULL) - goto out; - - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = slot->slots[offset]; - } while (shift); - - if (slot == NULL) - goto out; + offset = index & RADIX_TREE_MAP_MASK; /* * Clear all tags associated with the item to be deleted. @@ -1337,40 +1434,27 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_tag_clear(root, index, tag); } - to_free = NULL; - /* Now free the nodes we do not need anymore */ - while (node) { - node->slots[offset] = NULL; - node->count--; - /* - * Queue the node for deferred freeing after the - * last reference to it disappears (set NULL, above). - */ - if (to_free) - radix_tree_node_free(to_free); - - if (node->count) { - if (node == indirect_to_ptr(root->rnode)) - radix_tree_shrink(root); - goto out; - } - - /* Node with zero slots in use so free it */ - to_free = node; + node->slots[offset] = NULL; + node->count--; - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - node = node->parent; - } + __radix_tree_delete_node(root, index, node); - root_tag_clear_all(root); - root->height = 0; - root->rnode = NULL; - if (to_free) - radix_tree_node_free(to_free); + return entry; +} +EXPORT_SYMBOL(radix_tree_delete_item); -out: - return slot; +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565d..1356454e3 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -2,7 +2,8 @@ Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse - + (C) 2012 Michel Lespinasse + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -20,339 +21,428 @@ linux/lib/rbtree.c */ -#include +#include #include -static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_right = right->rb_left)) - rb_set_parent(right->rb_left, node); - right->rb_left = node; +/* + * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree + * + * 1) A node is either red or black + * 2) The root is black + * 3) All leaves (NULL) are black + * 4) Both children of every red node are black + * 5) Every simple path from root to leaves contains the same number + * of black nodes. + * + * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two + * consecutive red nodes in a path and every red node is therefore followed by + * a black. So if B is the number of black nodes on every simple path (as per + * 5), then the longest possible path due to 4 is 2B. + * + * We shall indicate color with case, where black nodes are uppercase and red + * nodes will be lowercase. Unknown color nodes shall be drawn as red within + * parentheses and have some accompanying text comment. + */ - rb_set_parent(right, parent); +/* + * Notes on lockless lookups: + * + * All stores to the tree structure (rb_left and rb_right) must be done using + * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the + * tree structure as seen in program order. + * + * These two requirements will allow lockless iteration of the tree -- not + * correct iteration mind you, tree rotations are not atomic so a lookup might + * miss entire subtrees. + * + * But they do guarantee that any such traversal will only see valid elements + * and that it will indeed complete -- does not get stuck in a loop. + * + * It also guarantees that if the lookup returns an element it is the 'correct' + * one. But not returning an element does _NOT_ mean it's not present. + * + * NOTE: + * + * Stores to __rb_parent_color are not important for simple lookups so those + * are left undone as of now. Nor did I check for loops involving parent + * pointers. + */ - if (parent) - { - if (node == parent->rb_left) - parent->rb_left = right; - else - parent->rb_right = right; - } - else - root->rb_node = right; - rb_set_parent(node, right); +static inline void rb_set_black(struct rb_node *rb) +{ + rb->__rb_parent_color |= RB_BLACK; } -static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +static inline struct rb_node *rb_red_parent(struct rb_node *red) { - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_left = left->rb_right)) - rb_set_parent(left->rb_right, node); - left->rb_right = node; - - rb_set_parent(left, parent); + return (struct rb_node *)red->__rb_parent_color; +} - if (parent) - { - if (node == parent->rb_right) - parent->rb_right = left; - else - parent->rb_left = left; - } - else - root->rb_node = left; - rb_set_parent(node, left); +/* + * Helper function for rotations: + * - old's parent and color get assigned to new + * - old gets assigned new as a parent and 'color' as a color. + */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + __rb_change_child(old, new, parent, root); } -void rb_insert_color(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *parent, *gparent; - - while ((parent = rb_parent(node)) && rb_is_red(parent)) - { - gparent = rb_parent(parent); - - if (parent == gparent->rb_left) - { - { - register struct rb_node *uncle = gparent->rb_right; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + + while (true) { + /* + * Loop invariant: node is red + * + * If there is a black parent, we are done. + * Otherwise, take some corrective action as we don't + * want a red root or two consecutive red nodes. + */ + if (!parent) { + rb_set_parent_color(node, NULL, RB_BLACK); + break; + } else if (rb_is_black(parent)) + break; + + gparent = rb_red_parent(parent); + + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - color flips + * + * G g + * / \ / \ + * p u --> P U + * / / + * n n + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } - if (parent->rb_right == node) - { - register struct rb_node *tmp; - __rb_rotate_left(parent, root); - tmp = parent; + tmp = parent->rb_right; + if (node == tmp) { + /* + * Case 2 - left rotate at parent + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. + */ + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; - node = tmp; + tmp = node->rb_right; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_right(gparent, root); + /* + * Case 3 - right rotate at gparent + * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; } else { - { - register struct rb_node *uncle = gparent->rb_left; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } - if (parent->rb_left == node) - { - register struct rb_node *tmp; - __rb_rotate_right(parent, root); - tmp = parent; + tmp = parent->rb_left; + if (node == tmp) { + /* Case 2 - right rotate at parent */ + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; - node = tmp; + tmp = node->rb_left; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_left(gparent, root); + /* Case 3 - left rotate at gparent */ + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; } } - - rb_set_black(root->rb_node); } -EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - struct rb_root *root) +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *other; - - while ((!node || rb_is_black(node)) && node != root->rb_node) - { - if (parent->rb_left == node) - { - other = parent->rb_right; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_left(parent, root); - other = parent->rb_right; + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; + + while (true) { + /* + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. + */ + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); - } - else - { - if (!other->rb_right || rb_is_black(other->rb_right)) - { - rb_set_black(other->rb_left); - rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. + */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); - node = root->rb_node; - break; - } - } - else - { - other = parent->rb_left; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_right(parent, root); - other = parent->rb_left; + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N Sl + * / \ \ + * sl Sr s + * \ + * Sr + */ + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. + * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } else { + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; } - else - { - if (!other->rb_left || rb_is_black(other->rb_left)) - { - rb_set_black(other->rb_right); - rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); - node = root->rb_node; - break; + /* Case 3 - right rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; } + /* Case 4 - left rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; } } - if (node) - rb_set_black(node); } -void rb_erase(struct rb_node *node, struct rb_root *root) +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *child, *parent; - int color; - - if (!node->rb_left) - child = node->rb_right; - else if (!node->rb_right) - child = node->rb_left; - else - { - struct rb_node *old = node, *left; - - node = node->rb_right; - while ((left = node->rb_left) != NULL) - node = left; - - if (rb_parent(old)) { - if (rb_parent(old)->rb_left == old) - rb_parent(old)->rb_left = node; - else - rb_parent(old)->rb_right = node; - } else - root->rb_node = node; - - child = node->rb_right; - parent = rb_parent(node); - color = rb_color(node); - - if (parent == old) { - parent = node; - } else { - if (child) - rb_set_parent(child, parent); - parent->rb_left = child; - - node->rb_right = old->rb_right; - rb_set_parent(old->rb_right, node); - } - - node->rb_parent_color = old->rb_parent_color; - node->rb_left = old->rb_left; - rb_set_parent(old->rb_left, node); - - goto color; - } - - parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); - if (parent) - { - if (parent->rb_left == node) - parent->rb_left = child; - else - parent->rb_right = child; - } - else - root->rb_node = child; - - color: - if (color == RB_BLACK) - __rb_erase_color(child, parent, root); + ____rb_erase_color(parent, root, augment_rotate); } -EXPORT_SYMBOL(rb_erase); +EXPORT_SYMBOL(__rb_erase_color); -static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) -{ - struct rb_node *parent; +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. + */ -up: - func(node, data); - parent = rb_parent(node); - if (!parent) - return; +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} - if (node == parent->rb_left && parent->rb_right) - func(parent->rb_right, data); - else if (parent->rb_left) - func(parent->rb_left, data); +static const struct rb_augment_callbacks dummy_callbacks = { + dummy_propagate, dummy_copy, dummy_rotate +}; - node = parent; - goto up; -} - -/* - * after inserting @node into the tree, update the tree to account for - * both the new entry and any damage done by rebalance - */ -void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) +void rb_insert_color(struct rb_node *node, struct rb_root *root) { - if (node->rb_left) - node = node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - rb_augment_path(node, func, data); + __rb_insert(node, root, dummy_rotate); } -EXPORT_SYMBOL(rb_augment_insert); +EXPORT_SYMBOL(rb_insert_color); -/* - * before removing the node, find the deepest node on the rebalance path - * that will still be there after @node gets removed - */ -struct rb_node *rb_augment_erase_begin(struct rb_node *node) +void rb_erase(struct rb_node *node, struct rb_root *root) { - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); } -EXPORT_SYMBOL(rb_augment_erase_begin); +EXPORT_SYMBOL(rb_erase); /* - * after removal, update the tree to account for the removed entry - * and any rebalance damage. + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. */ -void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - if (node) - rb_augment_path(node, func, data); + __rb_insert(node, root, augment_rotate); } -EXPORT_SYMBOL(rb_augment_erase_end); +EXPORT_SYMBOL(__rb_insert_augmented); /* * This function returns the first node (in sort order) of the tree. @@ -387,11 +477,13 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a right-hand child, go down and then left as far - as we can. */ + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) @@ -399,12 +491,13 @@ struct rb_node *rb_next(const struct rb_node *node) return (struct rb_node *)node; } - /* No right-hand children. Everything down and left is - smaller than us, so any 'next' node must be in the general - direction of our parent. Go up the tree; any time the - ancestor is a right-hand child of its parent, keep going - up. First time it's a left-hand child of its parent, said - parent is our 'next' node. */ + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; @@ -416,11 +509,13 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a left-hand child, go down and then right as far - as we can. */ + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) @@ -428,8 +523,10 @@ struct rb_node *rb_prev(const struct rb_node *node) return (struct rb_node *)node; } - /* No left-hand children. Go up till we find an ancestor which - is a right-hand child of its parent */ + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. + */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; @@ -443,14 +540,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ - if (parent) { - if (victim == parent->rb_left) - parent->rb_left = new; - else - parent->rb_right = new; - } else { - root->rb_node = new; - } + __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) @@ -460,3 +550,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, *new = *victim; } EXPORT_SYMBOL(rb_replace_node); + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} +EXPORT_SYMBOL(rb_next_postorder); + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} +EXPORT_SYMBOL(rb_first_postorder); diff --git a/mm/Kconfig b/mm/Kconfig index e0b4f6fbb..81c5191d2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -466,6 +466,22 @@ config ZSWAP from this compressed store much faster than most tradition swap devices resulting in reduced I/O and faster performance for many workloads. + +config ZCACHE + bool "Compressed cache for file pages (EXPERIMENTAL)" + depends on CRYPTO && CLEANCACHE + select CRYPTO_LZO + select ZBUD + default n + help + A compressed cache for file pages. + It takes active file pages that are in the process of being reclaimed + and attempts to compress them into a dynamically allocated RAM-based + memory pool. + + If this process is successful, when those file pages needed again, the + I/O reading operation was avoided. This results in a significant performance + gains under memory pressure for systems full with file pages. config SWAP_ENABLE_READAHEAD bool "Enable readahead on page swap in" @@ -482,6 +498,24 @@ config ZSWAP_ENABLE_WRITEBACK depends on ZSWAP default n +config ZPOOL + tristate "Common API for compressed memory storage" + default n + help + Compressed memory storage API. This allows using either zbud or + zsmalloc. + +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + + config DIRECT_RECLAIM_FILE_PAGES_ONLY bool "Reclaim file pages only on direct reclaim path" depends on ZSWAP diff --git a/mm/Makefile b/mm/Makefile index 0774dd357..f279b0dc1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZCACHE) += zcache.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o @@ -54,3 +55,5 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_ZSMALLOC_NEW) += zsmalloc.o +obj-$(CONFIG_ZPOOL) += zpool.o +obj-$(CONFIG_ZBUD) += zbud.o \ No newline at end of file diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c740f..1d97b0d05 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,23 +19,13 @@ #include /* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/invalidate even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled __read_mostly; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops cleancache_ops __read_mostly; +static const struct cleancache_ops *cleancache_ops __read_mostly; /* - * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * Counters available via /sys/kernel/debug/cleancache (if debugfs is * properly configured. These are for information only so are not protected * against increment races. */ @@ -44,32 +34,107 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} + /* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting + * Register operations for cleancache. Returns 0 on success. */ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(const struct cleancache_ops *ops) { - struct cleancache_ops old = cleancache_ops; + if (cmpxchg(&cleancache_ops, NULL, ops)) + return -EBUSY; - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; + /* + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. + */ + iterate_supers(cleancache_register_ops_sb, NULL); + return 0; } EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); + int pool_id = CLEANCACHE_NO_BACKEND; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -91,7 +156,7 @@ static int cleancache_get_key(struct inode *inode, struct dentry d; d.d_inode = inode; len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) + if (len <= FILEID_ROOT || len == FILEID_INVALID) return -1; if (maxlen > CLEANCACHE_KEY_MAX) return -1; @@ -106,6 +171,10 @@ static int cleancache_get_key(struct inode *inode, * successful, use it to fill the specified page with data and return 0. * The pageframe is unchanged and returns -1 if the get fails. * Page must be locked by caller. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ int __cleancache_get_page(struct page *page) { @@ -113,6 +182,11 @@ int __cleancache_get_page(struct page *page) int pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + VM_BUG_ON(!PageLocked(page)); pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id < 0) @@ -121,7 +195,7 @@ int __cleancache_get_page(struct page *page) if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -136,17 +210,26 @@ EXPORT_SYMBOL(__cleancache_get_page); * (previously-obtained per-filesystem) poolid and the page's, * inode and page index. Page must be locked. Note that a put_page * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_put_page(struct page *page) { int pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_puts++; + return; + } + VM_BUG_ON(!PageLocked(page)); pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); cleancache_puts++; } } @@ -155,6 +238,10 @@ EXPORT_SYMBOL(__cleancache_put_page); /* * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) @@ -163,11 +250,14 @@ void __cleancache_invalidate_page(struct address_space *mapping, int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + if (pool_id >= 0) { VM_BUG_ON(!PageLocked(page)); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.invalidate_page)(pool_id, - key, page->index); + cleancache_ops->invalidate_page(pool_id, + key, page->index); cleancache_invalidates++; } } @@ -178,29 +268,38 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_inode(struct address_space *mapping) { int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.invalidate_inode)(pool_id, key); + cleancache_ops->invalidate_inode(pool_id, key); } EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. */ void __cleancache_invalidate_fs(struct super_block *sb) { - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.invalidate_fs)(old_poolid); - } + int pool_id; + + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); diff --git a/mm/filemap.c b/mm/filemap.c index 02c00eeef..e5e1543f9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -666,6 +666,86 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } } +/** + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(&mapping->page_tree, index)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -2442,6 +2522,11 @@ static ssize_t generic_perform_write(struct file *file, break; } + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status)) @@ -2482,10 +2567,6 @@ static ssize_t generic_perform_write(struct file *file, written += copied; balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } } while (iov_iter_count(i)); return written ? written : status; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc36e280c..e622aab7f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2503,6 +2503,14 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, if (iter_vma == vma) continue; + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5265cb948..6e96f56da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5229,7 +5229,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; diff --git a/mm/memory.c b/mm/memory.c index ec059db19..ed5b26277 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3258,7 +3258,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (vm_swap_full(page_swap_info(page)) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (swapcache) { diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8..8be8c84b9 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) /* shmem/tmpfs may return swap: account for swapcache page too. */ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif if (page) { @@ -135,7 +135,7 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { #ifdef CONFIG_SWAP pgoff = entry.val; - *vec = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(swap_address_space(entry), pgoff); #else WARN_ON(1); *vec = 1; diff --git a/mm/mmap.c b/mm/mmap.c index 1ce374f97..07b3d1fff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index db8d983b5..8855dec4e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index 75ff3c173..3685abade 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79d3c67be..1cfc67c3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -317,7 +317,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -349,7 +349,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -4039,7 +4039,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); + page_mapcount_reset(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -6312,6 +6312,9 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_scfslower, "scfslower"}, {1UL << PG_nocache,"nocache"}, #endif +#ifdef CONFIG_ZCACHE + {1UL << PG_was_active, "PG_was_active" }, +#endif }; static void dump_page_flags(unsigned long flags) diff --git a/mm/page_io.c b/mm/page_io.c index d0b4d4a55..c1d2f81db 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -145,17 +146,43 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int swap_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); if (try_to_free_swap(page)) { unlock_page(page); goto out; } + if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); goto out; } + if (sis->flags & SWP_FILE) { + struct kiocb kiocb; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iovec iov = { + .iov_base = page_address(page), + .iov_len = PAGE_SIZE, + }; + + init_sync_kiocb(&kiocb, swap_file); + kiocb.ki_pos = page_file_offset(page); + kiocb.ki_left = PAGE_SIZE; + kiocb.ki_nbytes = PAGE_SIZE; + + unlock_page(page); + ret = mapping->a_ops->direct_IO(KERNEL_WRITE, + &kiocb, &iov, + kiocb.ki_pos, 1); + if (ret == PAGE_SIZE) { + count_vm_event(PSWPOUT); + ret = 0; + } + return ret; + } ret = __swap_writepage(page, wbc, end_swap_bio_write); out: return ret; @@ -188,6 +215,7 @@ int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); @@ -196,6 +224,15 @@ int swap_readpage(struct page *page) unlock_page(page); goto out; } + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + ret = mapping->a_ops->readpage(swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); @@ -207,3 +244,15 @@ int swap_readpage(struct page *page) out: return ret; } + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct address_space *mapping = sis->swap_file->f_mapping; + return mapping->a_ops->set_page_dirty(page); + } else { + return __set_page_dirty_no_writeback(page); + } +} \ No newline at end of file diff --git a/mm/readahead.c b/mm/readahead.c index 0eac4cbf6..abec1a59b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -367,7 +367,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -446,7 +446,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) diff --git a/mm/slob.c b/mm/slob.c index 8105be42c..dcc976b33 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -117,7 +117,7 @@ static inline void struct_slob_page_wrong_size(void) */ static inline void free_slob_page(struct slob_page *sp) { - reset_page_mapcount(&sp->page); + page_mapcount_reset(&sp->page); sp->page.mapping = NULL; } diff --git a/mm/slub.c b/mm/slub.c index 286f736e9..5cb23f8ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1419,7 +1419,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) -pages); __ClearPageSlab(page); - reset_page_mapcount(page); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); diff --git a/mm/swap.c b/mm/swap.c index a8feea68a..db74d14f9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -845,7 +845,13 @@ void __init swap_setup(void) unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + bdi_init(swapper_spaces[0].backing_dev_info); + for (i = 0; i < MAX_SWAPFILES; i++) { + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ diff --git a/mm/swap_state.c b/mm/swap_state.c index e561867c1..f1ee1e8fa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -26,7 +26,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .set_page_dirty = swap_set_page_dirty, .migratepage = migrate_page, }; @@ -35,12 +35,12 @@ static struct backing_dev_info swap_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) @@ -52,9 +52,19 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); @@ -70,6 +80,7 @@ void show_swap_cache_info(void) int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapCache(page)); @@ -79,14 +90,15 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); set_page_private(page, entry.val); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -122,14 +134,18 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -195,12 +211,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry, page); page_cache_release(page); @@ -263,7 +281,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page) INC_CACHE_INFO(find_success); @@ -290,7 +308,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), entry.val); if (found_page) break; @@ -393,7 +411,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct page *page; unsigned long offset = swp_offset(entry); unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask = is_swap_fast(entry) ? 0 : + (1UL << page_cluster) - 1; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; diff --git a/mm/swapfile.c b/mm/swapfile.c index 99d16ec7c..f00f23601 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -73,6 +73,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } +bool is_swap_fast(swp_entry_t entry) +{ + struct swap_info_struct *p; + unsigned long type; + + if (non_swap_entry(entry)) + return false; + + type = swp_type(entry); + if (type >= nr_swapfiles) + return false; + + p = swap_info[type]; + + if (p->flags & SWP_FAST) + return true; + + return false; +} + /* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) @@ -81,7 +101,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -212,7 +232,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { + if (si->flags & SWP_PAGE_DISCARD) { /* * Start range check on racing allocations, in case * they overlap the cluster we eventually decide on @@ -293,7 +313,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -322,7 +342,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->lowest_alloc) { /* - * Only set when SWP_DISCARDABLE, and there's a scan + * Only set when SWP_PAGE_DISCARD, and there's a scan * for a free cluster in progress or just completed. */ if (found_free_cluster) { @@ -382,7 +402,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -397,7 +417,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -746,7 +766,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; @@ -760,7 +780,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || vm_swap_full(page_swap_info(page)))) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -1492,7 +1512,9 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { - struct inode *inode; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; unsigned blkbits; @@ -1503,13 +1525,22 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) int nr_extents = 0; int ret; - inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; goto out; } + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(swap_file); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + } + goto out; + } + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; @@ -2083,6 +2114,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2190,8 +2236,39 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + printk(KERN_ERR + "swapon: discard_swap(%p): %d\n", + p, err); + } + } + + if (blk_queue_fast(bdev_get_queue(p->bdev))) + p->flags |= SWP_FAST; } mutex_lock(&swapon_mutex); @@ -2202,11 +2279,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); diff --git a/mm/util.c b/mm/util.c index ae962b31d..a280deeb1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -341,6 +343,24 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON(PageSlab(page)); +#ifdef CONFIG_SWAP + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else +#endif + if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = NULL; + return mapping; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmscan.c b/mm/vmscan.c index 530fab823..b27d8ba49 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -274,34 +274,41 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(struct shrink_control *shrink, +unsigned long shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_pages_scanned, unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; if (nr_pages_scanned == 0) nr_pages_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - long total_scan, pages_got; + long total_scan; long max_pass; - int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) + if (shrinker->count_objects) + max_pass = shrinker->count_objects(shrinker, shrinkctl); + else + max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); + if (max_pass <= 0) continue; /* @@ -317,9 +324,9 @@ unsigned long shrink_slab(struct shrink_control *shrink, do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->shrink, total_scan); total_scan = max_pass; } @@ -346,26 +353,35 @@ unsigned long shrink_slab(struct shrink_control *shrink, if (total_scan > max_pass * 2) total_scan = max_pass * 2; - trace_mm_shrink_slab_start(shrinker, shrink, nr, + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); while (total_scan >= batch_size) { - int nr_before; - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) { - pages_got = nr_before - shrink_ret; - ret += pages_got; - total_scan -= pages_got > batch_size ? pages_got : batch_size; - } else { - total_scan -= batch_size; - } + if (shrinker->scan_objects) { + unsigned long ret; + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + + if (ret == SHRINK_STOP) + break; + freed += ret; + } else { + int nr_before; + long ret; + + nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); + ret = do_shrinker_shrink(shrinker, shrinkctl, + batch_size); + if (ret == -1) + break; + if (ret < nr_before) + freed += nr_before - ret; + } + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } @@ -381,12 +397,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, else new_nr = atomic_long_read(&shrinker->nr_in_batch); - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); } up_read(&shrinker_rwsem); out: cond_resched(); - return ret; + return freed; } static inline int is_page_cache_freeable(struct page *page) @@ -974,7 +990,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && vm_swap_full(page_swap_info(page))) try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); @@ -1268,6 +1284,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, while (!list_empty(page_list)) { struct page *page = lru_to_page(page_list); int lru; + int file; VM_BUG_ON(PageLRU(page)); list_del(&page->lru); @@ -1280,6 +1297,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); + + file = is_file_lru(lru); +#ifdef CONFIG_ZCACHE + if (IS_ENABLED(CONFIG_ZCACHE)) + if (file) + SetPageWasActive(page); +#endif if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); @@ -1574,6 +1598,12 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + if (IS_ENABLED(CONFIG_ZCACHE)) + /* + * For zcache to know whether the page is from active + * file list + */ + SetPageWasActive(page); list_add(&page->lru, &l_inactive); } diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000..894108348 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,640 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is an special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justified" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the lists each zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + const struct zbud_ops *ops; +#ifdef CONFIG_ZPOOL + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +#endif +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); +#endif /* CONFIG_ZPOOL */ + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. + */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kzalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + int found = 0; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page. */ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + found = 1; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + if ((gfp & __GFP_ZERO) && found) + memset((void *)*handle, 0, size); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below). + */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retires: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. + * + * If the handle can not be evicted, the eviction handler should return + * non-zero. zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the setting of the PG_reclaim flag on the underlying page. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, -EAGAIN if + * the retry limit was hit. + */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for others allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. + * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + + return 0; +} + +static void __exit exit_zbud(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zcache.c b/mm/zcache.c new file mode 100644 index 000000000..ec1c7c919 --- /dev/null +++ b/mm/zcache.c @@ -0,0 +1,1158 @@ +/* + * linux/mm/zcache.c + * + * A cleancache backend for file pages compression. + * Concepts based on original zcache by Dan Magenheimer. + * Copyright (C) 2013 Bob Liu + * + * With zcache, active file pages can be compressed in memory during page + * reclaiming. When their data is needed again the I/O reading operation is + * avoided. This results in a significant performance gain under memory pressure + * for systems with many file pages. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Enable/disable zcache (enabled by default) + */ +static bool zcache_enabled = true; +module_param_named(enabled, zcache_enabled, bool, 0); + +/* + * Compressor to be used by zcache + */ +#define ZCACHE_COMPRESSOR_DEFAULT "lzo" +#ifndef CONFIG_CRYPTO_LZ4 +static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; +#else +static char *zcache_compressor = "lz4"; +#endif +module_param_named(compressor, zcache_compressor, charp, 0); + +/* + * The maximum percentage of memory that the compressed pool can occupy. + */ +static unsigned int zcache_max_pool_percent = 10; +module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644); + +static unsigned int zcache_clear_percent = 4; +module_param_named(clear_percent, zcache_clear_percent, uint, 0644); +/* + * zcache statistics + */ +static u64 zcache_pool_limit_hit; +static u64 zcache_dup_entry; +static u64 zcache_zbud_alloc_fail; +static u64 zcache_evict_zpages; +static u64 zcache_evict_filepages; +static u64 zcache_inactive_pages_refused; +static u64 zcache_reclaim_fail; +static u64 zcache_pool_shrink; +static u64 zcache_pool_shrink_fail; +static u64 zcache_pool_shrink_pages; +static u64 zcache_store_failed; +static atomic_t zcache_stored_pages = ATOMIC_INIT(0); +static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0); + +#define GFP_ZCACHE \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC | __GFP_NO_KSWAPD | __GFP_ZERO) + +/* + * Make sure this is different from radix tree + * indirect ptr or exceptional entry. + */ +#define ZERO_HANDLE ((void *)~(~0UL >> 1)) + +/* + * Zcache receives pages for compression through the Cleancache API and is able + * to evict pages from its own compressed pool on an LRU basis in the case that + * the compressed pool is full. + * + * Zcache makes use of zbud for the managing the compressed memory pool. Each + * allocation in zbud is not directly accessible by address. Rather, a handle + * (zaddr) is return by the allocation routine and that handle(zaddr must be + * mapped before being accessed. The compressed memory pool grows on demand and + * shrinks as compressed pages are freed. + * + * When a file page is passed from cleancache to zcache, zcache maintains a + * mapping of the to the zbud + * address that references that compressed file page. This mapping is achieved + * with a red-black tree per filesystem type, plus a radix tree per red-black + * node. + * + * A zcache pool with pool_id as the index is created when a filesystem mounted + * Each zcache pool has a red-black tree, the inode number(rb_index) is the + * search key. Each red-black tree node has a radix tree which use + * page->index(ra_index) as the index. Each radix tree slot points to the zbud + * address combining with some extra information(zcache_ra_handle). + */ +#define MAX_ZCACHE_POOLS 32 +/* + * One zcache_pool per (cleancache aware) filesystem mount instance + */ +struct zcache_pool { + struct rb_root rbtree; + rwlock_t rb_lock; /* Protects rbtree */ + u64 size; + struct zbud_pool *pool; /* Zbud pool used */ +}; + +/* + * Manage all zcache pools + */ +struct _zcache { + struct zcache_pool *pools[MAX_ZCACHE_POOLS]; + u32 num_pools; /* Current no. of zcache pools */ + spinlock_t pool_lock; /* Protects pools[] and num_pools */ +}; +struct _zcache zcache; + +/* + * Redblack tree node, each node has a page index radix-tree. + * Indexed by inode nubmer. + */ +struct zcache_rbnode { + struct rb_node rb_node; + int rb_index; + struct radix_tree_root ratree; /* Page radix tree per inode rbtree */ + spinlock_t ra_lock; /* Protects radix tree */ + struct kref refcount; +}; + +/* + * Radix-tree leaf, indexed by page->index + */ +struct zcache_ra_handle { + int rb_index; /* Redblack tree index */ + int ra_index; /* Radix tree index */ + int zlen; /* Compressed page size */ + struct zcache_pool *zpool; /* Finding zcache_pool during evict */ +}; + +u64 zcache_pages(void) +{ + int i; + u64 count = 0; + + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + count += zcache.pools[i]->size; + + return count; +} + +static struct kmem_cache *zcache_rbnode_cache; +static int zcache_rbnode_cache_create(void) +{ + zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0); + return zcache_rbnode_cache == NULL; +} +static void zcache_rbnode_cache_destroy(void) +{ + kmem_cache_destroy(zcache_rbnode_cache); +} + +static int zcache_shrink(struct shrinker *s, struct shrink_control *sc) +{ + unsigned long active_file; + unsigned long file; + long file_gap; + unsigned long freed = 0; + unsigned long pool; + static bool running; + int i = 0; + int retries; + + if (running) + goto end; + + running = true; + active_file = global_page_state(NR_ACTIVE_FILE); + file = global_page_state(NR_FILE_PAGES); + pool = zcache_pages(); + + file_gap = pool - file; + + if ((file_gap >= 0) && + (totalram_pages * zcache_clear_percent / 100 > file)) { + file_gap = pool; + zcache_pool_shrink++; + goto reclaim; + } + + /* + * file_gap == 0 means that the number of pages + * stored by zcache is around twice as many as the + * number of active file pages. + */ + file_gap = pool - active_file; + if (file_gap < 0) + file_gap = 0; + else + zcache_pool_shrink++; + +reclaim: + retries = file_gap; + while ((file_gap > 0) && retries) { + struct zcache_pool *zpool = + zcache.pools[i++ % MAX_ZCACHE_POOLS]; + if (!zpool || !zpool->size) + continue; + if (zbud_reclaim_page(zpool->pool, 8)) { + zcache_pool_shrink_fail++; + retries--; + continue; + } + freed++; + file_gap--; + } + + zcache_pool_shrink_pages += freed; + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + zcache.pools[i]->size = + zbud_get_pool_size(zcache.pools[i]->pool); + + running = false; +end: + return freed; +} + +static struct shrinker zcache_shrinker = { + .shrink = zcache_shrink, + .seeks = DEFAULT_SEEKS * 16 +}; + +/* + * Compression functions + * (Below functions are copyed from zswap!) + */ +static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; + +enum comp_op { + ZCACHE_COMPOP_COMPRESS, + ZCACHE_COMPOP_DECOMPRESS +}; + +static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZCACHE_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZCACHE_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zcache_comp_init(void) +{ + if (!crypto_has_comp(zcache_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zcache_compressor); + /* fall back to default compressor */ + zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zcache_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zcache_compressor); + + /* alloc percpu transforms */ + zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zcache_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void zcache_comp_exit(void) +{ + /* free percpu transforms */ + if (zcache_comp_pcpu_tfms) + free_percpu(zcache_comp_pcpu_tfms); +} + +/* + * Per-cpu code + * (Below functions are also copyed from zswap!) + */ +static DEFINE_PER_CPU(u8 *, zcache_dstmem); + +static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zcache_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; + dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zcache_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zcache_dstmem, cpu); + kfree(dst); + per_cpu(zcache_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcache_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + + return __zcache_cpu_notifier(action, cpu); +} + +static struct notifier_block zcache_cpu_notifier_block = { + .notifier_call = zcache_cpu_notifier +}; + +static int zcache_cpu_init(void) +{ + unsigned long cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + register_cpu_notifier(&zcache_cpu_notifier_block); + put_online_cpus(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcache_cpu_notifier(CPU_UP_CANCELED, cpu); + put_online_cpus(); + return -ENOMEM; +} + +/* + * Zcache helpers + */ +static bool zcache_is_full(void) +{ + long file = global_page_state(NR_FILE_PAGES); + + return ((totalram_pages * zcache_max_pool_percent / 100 < + zcache_pages()) || + (totalram_pages * zcache_clear_percent / 100 > + file)); +} + +/* + * The caller must hold zpool->rb_lock at least + */ +static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree, + int index, struct rb_node **rb_parent, struct rb_node ***rb_link) +{ + struct zcache_rbnode *entry; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; + + __rb_link = &rbtree->rb_node; + rb_prev = __rb_parent = NULL; + + while (*__rb_link) { + __rb_parent = *__rb_link; + entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node); + if (entry->rb_index > index) + __rb_link = &__rb_parent->rb_left; + else if (entry->rb_index < index) { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } else + return entry; + } + + if (rb_parent) + *rb_parent = __rb_parent; + if (rb_link) + *rb_link = __rb_link; + return NULL; +} + +static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool, + int rb_index) +{ + unsigned long flags; + struct zcache_rbnode *rbnode; + + read_lock_irqsave(&zpool->rb_lock, flags); + rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, 0, 0); + if (rbnode) + kref_get(&rbnode->refcount); + read_unlock_irqrestore(&zpool->rb_lock, flags); + return rbnode; +} + +/* + * kref_put callback for zcache_rbnode. + * + * The rbnode must have been isolated from rbtree already. + */ +static void zcache_rbnode_release(struct kref *kref) +{ + struct zcache_rbnode *rbnode; + + rbnode = container_of(kref, struct zcache_rbnode, refcount); + BUG_ON(rbnode->ratree.rnode); + kmem_cache_free(zcache_rbnode_cache, rbnode); +} + +/* + * Check whether the radix-tree of this rbnode is empty. + * If that's true, then we can delete this zcache_rbnode from + * zcache_pool->rbtree + * + * Caller must hold zcache_rbnode->ra_lock + */ +static int zcache_rbnode_empty(struct zcache_rbnode *rbnode) +{ + return rbnode->ratree.rnode == NULL; +} + +/* + * Remove zcache_rbnode from zpool->rbtree + * + * holded_rblock - whether the caller has holded zpool->rb_lock + */ +static void zcache_rbnode_isolate(struct zcache_pool *zpool, + struct zcache_rbnode *rbnode, bool holded_rblock) +{ + unsigned long flags; + + if (!holded_rblock) + write_lock_irqsave(&zpool->rb_lock, flags); + /* + * Someone can get reference on this rbnode before we could + * acquire write lock above. + * We want to remove it from zpool->rbtree when only the caller and + * corresponding ratree holds a reference to this rbnode. + * Below check ensures that a racing zcache put will not end up adding + * a page to an isolated node and thereby losing that memory. + */ + if (atomic_read(&rbnode->refcount.refcount) == 2) { + rb_erase(&rbnode->rb_node, &zpool->rbtree); + RB_CLEAR_NODE(&rbnode->rb_node); + kref_put(&rbnode->refcount, zcache_rbnode_release); + } + if (!holded_rblock) + write_unlock_irqrestore(&zpool->rb_lock, flags); +} + +/* + * Store zaddr which allocated by zbud_alloc() to the hierarchy rbtree-ratree. + */ +static int zcache_store_zaddr(struct zcache_pool *zpool, + int ra_index, int rb_index, unsigned long zaddr) +{ + unsigned long flags; + struct zcache_rbnode *rbnode, *tmp; + struct rb_node **link = NULL, *parent = NULL; + int ret; + void *dup_zaddr; + + rbnode = zcache_find_get_rbnode(zpool, rb_index); + if (!rbnode) { + /* alloc and init a new rbnode */ + rbnode = kmem_cache_alloc(zcache_rbnode_cache, + GFP_ZCACHE); + if (!rbnode) + return -ENOMEM; + + INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN); + spin_lock_init(&rbnode->ra_lock); + rbnode->rb_index = rb_index; + kref_init(&rbnode->refcount); + RB_CLEAR_NODE(&rbnode->rb_node); + + /* add that rbnode to rbtree */ + write_lock_irqsave(&zpool->rb_lock, flags); + tmp = zcache_find_rbnode(&zpool->rbtree, rb_index, + &parent, &link); + if (tmp) { + /* somebody else allocated new rbnode */ + kmem_cache_free(zcache_rbnode_cache, rbnode); + rbnode = tmp; + } else { + rb_link_node(&rbnode->rb_node, parent, link); + rb_insert_color(&rbnode->rb_node, &zpool->rbtree); + } + + /* Inc the reference of this zcache_rbnode */ + kref_get(&rbnode->refcount); + write_unlock_irqrestore(&zpool->rb_lock, flags); + } + + /* Succfully got a zcache_rbnode when arriving here */ + spin_lock_irqsave(&rbnode->ra_lock, flags); + dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index); + if (unlikely(dup_zaddr)) { + if (dup_zaddr == ZERO_HANDLE) { + atomic_dec(&zcache_stored_zero_pages); + } else { + zbud_free(zpool->pool, (unsigned long)dup_zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + zcache_dup_entry++; + } + + /* Insert zcache_ra_handle to ratree */ + ret = radix_tree_insert(&rbnode->ratree, ra_index, + (void *)zaddr); + spin_unlock_irqrestore(&rbnode->ra_lock, flags); + if (unlikely(ret)) { + write_lock_irqsave(&zpool->rb_lock, flags); + spin_lock(&rbnode->ra_lock); + + if (zcache_rbnode_empty(rbnode)) + zcache_rbnode_isolate(zpool, rbnode, 1); + + spin_unlock(&rbnode->ra_lock); + write_unlock_irqrestore(&zpool->rb_lock, flags); + } + + kref_put(&rbnode->refcount, zcache_rbnode_release); + return ret; +} + +/* + * Load zaddr and delete it from radix tree. + * If the radix tree of the corresponding rbnode is empty, delete the rbnode + * from zpool->rbtree also. + */ +static void *zcache_load_delete_zaddr(struct zcache_pool *zpool, + int rb_index, int ra_index) +{ + struct zcache_rbnode *rbnode; + void *zaddr = NULL; + unsigned long flags; + + rbnode = zcache_find_get_rbnode(zpool, rb_index); + if (!rbnode) + goto out; + + BUG_ON(rbnode->rb_index != rb_index); + + spin_lock_irqsave(&rbnode->ra_lock, flags); + zaddr = radix_tree_delete(&rbnode->ratree, ra_index); + spin_unlock_irqrestore(&rbnode->ra_lock, flags); + + /* rb_lock and ra_lock must be taken again in the given sequence */ + write_lock_irqsave(&zpool->rb_lock, flags); + spin_lock(&rbnode->ra_lock); + if (zcache_rbnode_empty(rbnode)) + zcache_rbnode_isolate(zpool, rbnode, 1); + spin_unlock(&rbnode->ra_lock); + write_unlock_irqrestore(&zpool->rb_lock, flags); + + kref_put(&rbnode->refcount, zcache_rbnode_release); +out: + return zaddr; +} + +static bool zero_page(struct page *page) +{ + unsigned long *ptr = kmap_atomic(page); + int i; + bool ret = false; + + for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) { + if (ptr[i]) + goto out; + } + ret = true; +out: + kunmap_atomic(ptr); + return ret; +} + +static void zcache_store_page(int pool_id, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + struct zcache_ra_handle *zhandle; + u8 *zpage, *src, *dst; + /* Address of zhandle + compressed data(zpage) */ + unsigned long zaddr = 0; + unsigned int zlen = PAGE_SIZE; + bool zero = 0; + int ret; + + struct zcache_pool *zpool = zcache.pools[pool_id]; + + /* + * Zcache will be ineffective if the compressed memory pool is full with + * compressed inactive file pages and most of them will never be used + * again. + * So we refuse to compress pages that are not from active file list. + */ + if (!PageWasActive(page)) { + zcache_inactive_pages_refused++; + return; + } + + zero = zero_page(page); + if (zero) + goto zero; + + if (zcache_is_full()) { + zcache_pool_limit_hit++; + if (zbud_reclaim_page(zpool->pool, 8)) { + zcache_reclaim_fail++; + return; + } + /* + * Continue if reclaimed a page frame succ. + */ + zcache_evict_filepages++; + zpool->size = zbud_get_pool_size(zpool->pool); + } + + /* compress */ + dst = get_cpu_var(zcache_dstmem); + src = kmap_atomic(page); + ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst, + &zlen); + kunmap_atomic(src); + if (ret) { + pr_err("zcache compress error ret %d\n", ret); + put_cpu_var(zcache_dstmem); + return; + } + + /* store zcache handle together with compressed page data */ + ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle), + GFP_ZCACHE, &zaddr); + if (ret) { + zcache_zbud_alloc_fail++; + put_cpu_var(zcache_dstmem); + return; + } + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr); + + /* Compressed page data stored at the end of zcache_ra_handle */ + zpage = (u8 *)(zhandle + 1); + memcpy(zpage, dst, zlen); + zbud_unmap(zpool->pool, zaddr); + put_cpu_var(zcache_dstmem); + +zero: + if (zero) + zaddr = (unsigned long)ZERO_HANDLE; + + /* store zcache handle */ + ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr); + if (ret) { + zcache_store_failed++; + if (!zero) + zbud_free(zpool->pool, zaddr); + return; + } + + /* update stats */ + if (zero) { + atomic_inc(&zcache_stored_zero_pages); + } else { + zhandle->ra_index = index; + zhandle->rb_index = key.u.ino; + zhandle->zlen = zlen; + zhandle->zpool = zpool; + atomic_inc(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + + return; +} + +static int zcache_load_page(int pool_id, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + int ret = 0; + u8 *src, *dst; + void *zaddr; + unsigned int dlen = PAGE_SIZE; + struct zcache_ra_handle *zhandle; + struct zcache_pool *zpool = zcache.pools[pool_id]; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (!zaddr) + return -ENOENT; + else if (zaddr == ZERO_HANDLE) + goto map; + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, + (unsigned long)zaddr); + /* Compressed page data stored at the end of zcache_ra_handle */ + src = (u8 *)(zhandle + 1); + + /* decompress */ +map: + dst = kmap_atomic(page); + if (zaddr != ZERO_HANDLE) { + ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src, + zhandle->zlen, dst, &dlen); + } else { + memset(dst, 0, PAGE_SIZE); + kunmap_atomic(dst); + flush_dcache_page(page); + atomic_dec(&zcache_stored_zero_pages); + goto out; + } + kunmap_atomic(dst); + zbud_unmap(zpool->pool, (unsigned long)zaddr); + zbud_free(zpool->pool, (unsigned long)zaddr); + + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* update stats */ + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); +out: + SetPageWasActive(page); + return ret; +} + +static void zcache_flush_page(int pool_id, struct cleancache_filekey key, + pgoff_t index) +{ + struct zcache_pool *zpool = zcache.pools[pool_id]; + void *zaddr = NULL; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (zaddr && (zaddr != ZERO_HANDLE)) { + zbud_free(zpool->pool, (unsigned long)zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } else if (zaddr == ZERO_HANDLE) { + atomic_dec(&zcache_stored_zero_pages); + } +} + +#define FREE_BATCH 16 +/* + * Callers must hold the lock + */ +static void zcache_flush_ratree(struct zcache_pool *zpool, + struct zcache_rbnode *rbnode) +{ + unsigned long index = 0; + int count, i; + struct zcache_ra_handle *zhandle; + void *zaddr = NULL; + + do { + void *zaddrs[FREE_BATCH]; + unsigned long indices[FREE_BATCH]; + + count = radix_tree_gang_lookup_index(&rbnode->ratree, + (void **)zaddrs, indices, + index, FREE_BATCH); + + for (i = 0; i < count; i++) { + if (zaddrs[i] == ZERO_HANDLE) { + zaddr = radix_tree_delete(&rbnode->ratree, + indices[i]); + if (zaddr) + atomic_dec(&zcache_stored_zero_pages); + continue; + } + zhandle = (struct zcache_ra_handle *)zbud_map( + zpool->pool, (unsigned long)zaddrs[i]); + index = zhandle->ra_index; + zaddr = radix_tree_delete(&rbnode->ratree, index); + if (!zaddr) + continue; + zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]); + zbud_free(zpool->pool, (unsigned long)zaddrs[i]); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + + index++; + } while (count == FREE_BATCH); +} + +static void zcache_flush_inode(int pool_id, struct cleancache_filekey key) +{ + struct zcache_rbnode *rbnode; + unsigned long flags1, flags2; + struct zcache_pool *zpool = zcache.pools[pool_id]; + + /* + * Refuse new pages added in to the same rbinode, so get rb_lock at + * first. + */ + write_lock_irqsave(&zpool->rb_lock, flags1); + rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, 0, 0); + if (!rbnode) { + write_unlock_irqrestore(&zpool->rb_lock, flags1); + return; + } + + kref_get(&rbnode->refcount); + spin_lock_irqsave(&rbnode->ra_lock, flags2); + + zcache_flush_ratree(zpool, rbnode); + if (zcache_rbnode_empty(rbnode)) + /* When arrvied here, we already hold rb_lock */ + zcache_rbnode_isolate(zpool, rbnode, 1); + + spin_unlock_irqrestore(&rbnode->ra_lock, flags2); + write_unlock_irqrestore(&zpool->rb_lock, flags1); + kref_put(&rbnode->refcount, zcache_rbnode_release); +} + +static void zcache_destroy_pool(struct zcache_pool *zpool); +static void zcache_flush_fs(int pool_id) +{ + struct zcache_rbnode *z_rbnode = NULL; + struct rb_node *rbnode; + unsigned long flags1, flags2; + struct zcache_pool *zpool; + + if (pool_id < 0) + return; + + zpool = zcache.pools[pool_id]; + if (!zpool) + return; + + /* + * Refuse new pages added in, so get rb_lock at first. + */ + write_lock_irqsave(&zpool->rb_lock, flags1); + + rbnode = rb_first(&zpool->rbtree); + while (rbnode) { + z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node); + rbnode = rb_next(rbnode); + if (z_rbnode) { + kref_get(&z_rbnode->refcount); + spin_lock_irqsave(&z_rbnode->ra_lock, flags2); + zcache_flush_ratree(zpool, z_rbnode); + if (zcache_rbnode_empty(z_rbnode)) + zcache_rbnode_isolate(zpool, z_rbnode, 1); + spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2); + kref_put(&z_rbnode->refcount, zcache_rbnode_release); + } + } + + write_unlock_irqrestore(&zpool->rb_lock, flags1); + zcache_destroy_pool(zpool); +} + +/* + * Evict compressed pages from zcache pool on an LRU basis after the compressed + * pool is full. + */ +static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr) +{ + struct zcache_pool *zpool; + struct zcache_ra_handle *zhandle; + void *zaddr_intree; + + BUG_ON(zaddr == (unsigned long)ZERO_HANDLE); + + zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr); + + zpool = zhandle->zpool; + /* There can be a race with zcache store */ + if (!zpool) + return -EINVAL; + + BUG_ON(pool != zpool->pool); + + zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index, + zhandle->ra_index); + if (zaddr_intree) { + BUG_ON((unsigned long)zaddr_intree != zaddr); + zbud_unmap(pool, zaddr); + zbud_free(pool, zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(pool); + zcache_evict_zpages++; + } + return 0; +} + +static struct zbud_ops zcache_zbud_ops = { + .evict = zcache_evict_zpage +}; + +/* Return pool id */ +static int zcache_create_pool(void) +{ + int ret; + struct zcache_pool *zpool; + + zpool = kzalloc(sizeof(*zpool), GFP_KERNEL); + if (!zpool) { + ret = -ENOMEM; + goto out; + } + + zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops); + if (!zpool->pool) { + kfree(zpool); + ret = -ENOMEM; + goto out; + } + + spin_lock(&zcache.pool_lock); + if (zcache.num_pools == MAX_ZCACHE_POOLS) { + pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS); + zbud_destroy_pool(zpool->pool); + kfree(zpool); + ret = -EPERM; + goto out_unlock; + } + + rwlock_init(&zpool->rb_lock); + zpool->rbtree = RB_ROOT; + /* Add to pool list */ + for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++) + if (!zcache.pools[ret]) + break; + zcache.pools[ret] = zpool; + zcache.num_pools++; + pr_info("New pool created id:%d\n", ret); + +out_unlock: + spin_unlock(&zcache.pool_lock); +out: + return ret; +} + +static void zcache_destroy_pool(struct zcache_pool *zpool) +{ + int i; + + if (!zpool) + return; + + spin_lock(&zcache.pool_lock); + zcache.num_pools--; + for (i = 0; i < MAX_ZCACHE_POOLS; i++) + if (zcache.pools[i] == zpool) + break; + zcache.pools[i] = NULL; + spin_unlock(&zcache.pool_lock); + + if (!RB_EMPTY_ROOT(&zpool->rbtree)) + WARN_ON("Memory leak detected. Freeing non-empty pool!\n"); + + zbud_destroy_pool(zpool->pool); + kfree(zpool); +} + +static int zcache_init_fs(size_t pagesize) +{ + int ret; + + if (pagesize != PAGE_SIZE) { + pr_info("Unsupported page size: %zu", pagesize); + ret = -EINVAL; + goto out; + } + + ret = zcache_create_pool(); + if (ret < 0) { + pr_info("Failed to create new pool\n"); + ret = -ENOMEM; + goto out; + } +out: + return ret; +} + +static int zcache_init_shared_fs(char *uuid, size_t pagesize) +{ + /* shared pools are unsupported and map to private */ + return zcache_init_fs(pagesize); +} + +static struct cleancache_ops zcache_ops = { + .put_page = zcache_store_page, + .get_page = zcache_load_page, + .invalidate_page = zcache_flush_page, + .invalidate_inode = zcache_flush_inode, + .invalidate_fs = zcache_flush_fs, + .init_shared_fs = zcache_init_shared_fs, + .init_fs = zcache_init_fs +}; + +/* + * Debugfs functions + */ +#ifdef CONFIG_DEBUG_FS +#include + +static int pool_pages_get(void *_data, u64 *val) +{ + *val = zcache_pages(); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n"); + +static struct dentry *zcache_debugfs_root; + +static int __init zcache_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zcache_debugfs_root = debugfs_create_dir("zcache", NULL); + if (!zcache_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root, + &zcache_pool_limit_hit); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root, + &zcache_zbud_alloc_fail); + debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root, + &zcache_dup_entry); + debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL, + &pool_page_fops); + debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root, + &zcache_stored_pages); + debugfs_create_atomic_t("stored_zero_pages", S_IRUGO, + zcache_debugfs_root, &zcache_stored_zero_pages); + debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_zpages); + debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_filepages); + debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root, + &zcache_reclaim_fail); + debugfs_create_u64("inactive_pages_refused", S_IRUGO, + zcache_debugfs_root, &zcache_inactive_pages_refused); + debugfs_create_u64("pool_shrink_count", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink); + debugfs_create_u64("pool_shrink_fail", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_fail); + debugfs_create_u64("pool_shrink_pages", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_pages); + debugfs_create_u64("store_fail", S_IRUGO, + zcache_debugfs_root, &zcache_store_failed); + return 0; +} + +static void __exit zcache_debugfs_exit(void) +{ + debugfs_remove_recursive(zcache_debugfs_root); +} +#else +static int __init zcache_debugfs_init(void) +{ + return 0; +} +static void __exit zcache_debugfs_exit(void) +{ +} +#endif + +/* + * zcache init and exit + */ +static int __init init_zcache(void) +{ + if (!zcache_enabled) + return 0; + + pr_info("loading zcache..\n"); + if (zcache_rbnode_cache_create()) { + pr_err("entry cache creation failed\n"); + goto error; + } + + if (zcache_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zcache_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + + spin_lock_init(&zcache.pool_lock); + cleancache_register_ops(&zcache_ops); + + if (zcache_debugfs_init()) + pr_warn("debugfs initialization failed\n"); + register_shrinker(&zcache_shrinker); + return 0; +pcpufail: + zcache_comp_exit(); +compfail: + zcache_rbnode_cache_destroy(); +error: + return -ENOMEM; +} + +/* must be late so crypto has time to come up */ +late_initcall(init_zcache); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bob Liu "); +MODULE_DESCRIPTION("Compressed cache for clean file pages"); + diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000..308395ba8 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,359 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + const struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/* this assumes @type is null-terminated. */ +static struct zpool_driver *zpool_get_driver(const char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_has_pool() - Check if the pool driver is available + * @type The type of the zpool to check (e.g. zbud, zsmalloc) + * + * This checks if the @type pool driver is available. This will try to load + * the requested module, if needed, but there is no guarantee the module will + * still be loaded and available immediately after calling. If this returns + * true, the caller should assume the pool is available, but must be prepared + * to handle the @zpool_create_pool() returning failure. However if this + * returns false, the caller should assume the requested pool type is not + * available; either the requested pool type module does not exist, or could + * not be loaded, and calling @zpool_create_pool() with the pool type will + * fail. + * + * Returns: true if @type pool is available, false if not + */ +bool zpool_has_pool(char *type) +{ + struct zpool_driver *driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) + return false; + + zpool_put_driver(driver); + return true; +} +EXPORT_SYMBOL(zpool_has_pool); + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, + const struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_debug("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_debug("created pool type %s\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_debug("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index eff0ca018..7543e3fca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. @@ -10,43 +11,12 @@ * Released under the terms of GNU General Public License Version 2.0 */ - /* - * This allocator is designed for use with zcache and zram. Thus, the - * allocator is supposed to work well under low memory conditions. In - * particular, it never attempts higher order page allocation which is - * very likely to fail under memory pressure. On the other hand, if we - * just use single (0-order) pages, it would suffer from very high - * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy - * an entire page. This was one of the major issues with its predecessor - * (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE - * since this satisfies the requirements of all its current users (in the - * worst case, page is incompressible and is thus stored "as-is" i.e. in - * uncompressed form). For allocation requests larger than this size, failure - * is returned (see zs_malloc). - * - * Additionally, zs_malloc() does not return a dereferenceable pointer. - * Instead, it returns an opaque handle (unsigned long) which encodes actual - * location of the allocated object. The reason for this indirection is that - * zsmalloc does not keep zspages permanently mapped since that would cause - * issues on 32-bit systems where the VA region for kernel space mappings - * is very small. So, before using the allocating memory, the object has to - * be mapped using zs_map_object() to get a usable pointer and subsequently - * unmapped using zs_unmap_object(). - * * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -56,14 +26,18 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page + * If the page is first_page for huge object, it stores handle. + * Look at size_class->huge. * page->freelist: points to the first free object in zspage. * Free objects are linked together using in-place * metadata. + * page->objects: maximum number of objects we can store in this + * zspage (class->zspage_order * PAGE_SIZE / class->size) * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -71,12 +45,14 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include +#include #include #include #include -#include #include #include #include @@ -84,11 +60,12 @@ #include #include #include -#include +#include #include #include - +#include #include +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -105,6 +82,8 @@ #define ZS_MAX_ZSPAGE_ORDER 2 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + /* * Object location (, ) is encoded as * as single (unsigned long) handle value. @@ -128,17 +107,37 @@ #endif #endif #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) + +/* + * Memory for allocating for handle keeps object position by + * encoding and the encoded value has a room + * in least bit(ie, look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bit at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define MAX(a, b) ((a) >= (b) ? (a) : (b)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +/* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes @@ -151,8 +150,6 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) /* * We do not maintain any list for completely empty or full pages @@ -166,12 +163,38 @@ enum fullness_group { ZS_FULL }; +enum zs_stat_type { + OBJ_ALLOCATED, + OBJ_USED, + CLASS_ALMOST_FULL, + CLASS_ALMOST_EMPTY, +}; + +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + +struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; +}; + +#ifdef CONFIG_ZSMALLOC_STAT +static struct dentry *zs_stat_root; +#endif + +/* + * number of size_classes + */ +static int zs_size_classes; + /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where * n = number of allocated objects * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac + * f = fullness_threshold_frac * * Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f @@ -183,6 +206,8 @@ enum fullness_group { static const int fullness_threshold_frac = 4; struct size_class { + spinlock_t lock; + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. @@ -190,15 +215,12 @@ struct size_class { int size; unsigned int index; + struct zs_size_stat stats; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; }; /* @@ -208,14 +230,39 @@ struct size_class { * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; + union { + /* + * Position of next free chunk (encodes ) + * It's valid for non-allocated object + */ + void *next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; }; struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; + const char *name; + + struct size_class **size_class; + struct kmem_cache *handle_cachep; + + atomic_long_t pages_allocated; - struct zs_ops *ops; + struct zs_pool_stats stats; + + /* Compact classes */ + struct shrinker shrinker; + /* + * To signify that register_shrinker() was successful + * and unregister_shrinker() will not Oops. + */ + bool shrinker_enabled; +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif }; /* @@ -237,22 +284,128 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; -/* default page alloc/free ops */ -struct page *zs_alloc_page(gfp_t flags) +static int create_handle_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + return pool->handle_cachep ? 0 : 1; +} + +static void destroy_handle_cache(struct zs_pool *pool) +{ + kmem_cache_destroy(pool->handle_cachep); +} + +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + gfp & ~__GFP_HIGHMEM); +} + +static void free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static void record_obj(unsigned long handle, unsigned long obj) +{ + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); +} + +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size, gfp); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) { - return alloc_page(flags); + zs_unmap_object(pool, handle); } -void zs_free_page(struct page *page) +static u64 zs_zpool_total_size(void *pool) { - __free_page(page); + return zs_get_total_pages(pool) << PAGE_SHIFT; } -struct zs_ops zs_default_ops = { - .alloc = zs_alloc_page, - .free = zs_free_page +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, }; +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ + +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -266,26 +419,28 @@ static int is_last_page(struct page *page) return PagePrivate2(page); } -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, +static void get_zspage_mapping(struct page *first_page, + unsigned int *class_idx, enum fullness_group *fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - m = (unsigned long)page->mapping; + m = (unsigned long)first_page->mapping; *fullness = m & FULLNESS_MASK; *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; } -static void set_zspage_mapping(struct page *page, unsigned int class_idx, +static void set_zspage_mapping(struct page *first_page, + unsigned int class_idx, enum fullness_group fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; + first_page->mapping = (struct address_space *)m; } /* @@ -303,8 +458,171 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return idx; + return min(zs_size_classes - 1, idx); +} + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; +} + +#ifdef CONFIG_ZSMALLOC_STAT + +static void __init zs_stat_init(void) +{ + if (!debugfs_initialized()) { + pr_warn("debugfs not available, stat dir not created\n"); + return; + } + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + pr_warn("debugfs 'zsmalloc' stat dir creation failed\n"); +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static unsigned long zs_can_compact(struct size_class *class); + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used, freeable; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage", "freeable"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage, freeable); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + total_freeable += freeable; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages, "", total_freeable); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ + struct dentry *entry; + + if (!zs_stat_root) { + pr_warn("no root stat dir, not creating <%s> stat dir\n", name); + return; + } + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "classes"); + debugfs_remove_recursive(pool->stat_dentry); + pool->stat_dentry = NULL; + } +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ +static void __init zs_stat_init(void) +{ +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ } +#endif /* * For each size class, zspages are divided into different groups @@ -313,21 +631,20 @@ static int get_size_class_index(int size) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *page, - struct size_class *class) +static enum fullness_group get_fullness_group(struct page *first_page) { int inuse, max_objects; enum fullness_group fg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - inuse = page->inuse; - max_objects = class->pages_per_zspage * PAGE_SIZE / class->size; + inuse = first_page->inuse; + max_objects = first_page->objects; if (inuse == 0) fg = ZS_EMPTY; else if (inuse == max_objects) fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) + else if (inuse <= 3 * max_objects / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -341,33 +658,46 @@ static enum fullness_group get_fullness_group(struct page *page, * have. This functions inserts the given zspage into the freelist * identified by . */ -static void insert_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void insert_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; + zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); + if (!*head) { + *head = first_page; + return; + } - *head = page; + /* + * We want to see more ZS_FULL pages and less almost + * empty/full. Put pages with higher ->inuse first. + */ + list_add_tail(&first_page->lru, &(*head)->lru); + if (first_page->inuse >= (*head)->inuse) + *head = first_page; } /* * This function removes the given zspage from the freelist identified * by . */ -static void remove_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void remove_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -376,11 +706,13 @@ static void remove_zspage(struct page *page, struct size_class *class, BUG_ON(!*head); if (list_empty(&(*head)->lru)) *head = NULL; - else if (*head == page) + else if (*head == first_page) *head = (struct page *)list_entry((*head)->lru.next, struct page, lru); - list_del_init(&page->lru); + list_del_init(&first_page->lru); + zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -392,24 +724,22 @@ static void remove_zspage(struct page *page, struct size_class *class, * page from the freelist of the old fullness group to that of the new * fullness group. */ -static enum fullness_group fix_fullness_group(struct zs_pool *pool, - struct page *page) +static enum fullness_group fix_fullness_group(struct size_class *class, + struct page *first_page) { int class_idx; - struct size_class *class; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - get_zspage_mapping(page, &class_idx, &currfg); - class = &pool->size_class[class_idx]; - newfg = get_fullness_group(page, class); + get_zspage_mapping(first_page, &class_idx, &currfg); + newfg = get_fullness_group(first_page); if (newfg == currfg) goto out; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); + remove_zspage(class, currfg, first_page); + insert_zspage(class, newfg, first_page); + set_zspage_mapping(first_page, class_idx, newfg); out: return newfg; @@ -420,7 +750,8 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, * to form a zspage for each size class. This is important * to reduce wastage due to unusable space left at end of * each zspage which is given as: - * wastage = Zp - Zp % size_class + * wastage = Zp % class_size + * usage = Zp - wastage * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... * * For example, for size class of 3/8 * PAGE_SIZE, we should @@ -460,7 +791,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -470,35 +801,59 @@ static struct page *get_next_page(struct page *page) if (is_last_page(page)) next = NULL; else if (is_first_page(page)) - next = (struct page *)page->private; + next = (struct page *)page_private(page); else next = list_entry(page->lru.next, struct page, lru); return next; } -/* Encode as a single handle value */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +/* + * Encode as a single handle value. + * We use the least bit of handle for tagging. + */ +static void *location_to_obj(struct page *page, unsigned long obj_idx) { - unsigned long handle; + unsigned long obj; if (!page) { BUG_ON(obj_idx); return NULL; } - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= (obj_idx & OBJ_INDEX_MASK); + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; - return (void *)handle; + return (void *)obj; } -/* Decode pair from the given object handle */ -static void obj_handle_to_location(unsigned long handle, struct page **page, +/* + * Decode pair from the given object handle. We adjust the + * decoded obj_idx back to its original value since it was adjusted in + * location_to_obj(). + */ +static void obj_to_location(unsigned long obj, struct page **page, unsigned long *obj_idx) { - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = handle & OBJ_INDEX_MASK; + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); +} + +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + +static unsigned long obj_to_head(struct size_class *class, struct page *page, + void *obj) +{ + if (class->huge) { + VM_BUG_ON(!is_first_page(page)); + return page_private(page); + } else + return *(unsigned long *)obj; } static unsigned long obj_idx_to_offset(struct page *page, @@ -512,6 +867,25 @@ static unsigned long obj_idx_to_offset(struct page *page, return off + obj_idx * class_size; } +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + static void reset_page(struct page *page) { clear_bit(PG_private, &page->flags); @@ -519,10 +893,10 @@ static void reset_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; page->freelist = NULL; - reset_page_mapcount(page); + page_mapcount_reset(page); } -static void free_zspage(struct zs_ops *ops, struct page *first_page) +static void free_zspage(struct page *first_page) { struct page *nextp, *tmp, *head_extra; @@ -532,7 +906,7 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page) head_extra = (struct page *)page_private(first_page); reset_page(first_page); - ops->free(first_page); + __free_page(first_page); /* zspage with only 1 system page */ if (!head_extra) @@ -541,14 +915,14 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page) list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { list_del(&nextp->lru); reset_page(nextp); - ops->free(nextp); + __free_page(nextp); } reset_page(head_extra); - ops->free(head_extra); + __free_page(head_extra); } /* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) +static void init_zspage(struct size_class *class, struct page *first_page) { unsigned long off = 0; struct page *page = first_page; @@ -557,7 +931,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) while (page) { struct page *next_page; struct link_free *link; - unsigned int i, objs_on_page; + unsigned int i = 1; + void *vaddr; /* * page->index stores offset of first object starting @@ -568,16 +943,12 @@ static void init_zspage(struct page *first_page, struct size_class *class) if (page != first_page) page->index = off; - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } + while ((off += class->size) < PAGE_SIZE) { + link->next = location_to_obj(page, i++); + link += class->size / sizeof(*link); } /* @@ -586,18 +957,17 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); + link->next = location_to_obj(next_page, 0); + kunmap_atomic(vaddr); page = next_page; - off = (off + class->size) % PAGE_SIZE; + off %= PAGE_SIZE; } } /* * Allocate a zspage for the given size class */ -static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, - gfp_t flags) +static struct page *alloc_zspage(struct size_class *class, gfp_t flags) { int i, error; struct page *first_page = NULL, *uninitialized_var(prev_page); @@ -606,7 +976,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. Also, we set PG_private to identify the first page @@ -617,7 +987,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, for (i = 0; i < class->pages_per_zspage; i++) { struct page *page; - page = ops->alloc(flags); + page = alloc_page(flags); if (!page) goto cleanup; @@ -629,9 +999,9 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, first_page->inuse = 0; } if (i == 1) - first_page->private = (unsigned long)page; + set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -639,15 +1009,17 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, prev_page = page; } - init_zspage(first_page, class); + init_zspage(class, first_page); - first_page->freelist = obj_location_to_handle(first_page, 0); + first_page->freelist = location_to_obj(first_page, 0); + /* Maximum number of objects we can store in this zspage */ + first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; error = 0; /* Success */ cleanup: if (unlikely(error) && first_page) { - free_zspage(ops, first_page); + free_zspage(first_page); first_page = NULL; } @@ -702,14 +1074,11 @@ static inline void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { unsigned long addr = (unsigned long)area->vm_addr; - unsigned long end = addr + (PAGE_SIZE * 2); - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, PAGE_SIZE * 2); - flush_tlb_kernel_range(addr, end); + unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING*/ +#else /* CONFIG_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -719,7 +1088,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm_buf) return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; @@ -727,8 +1096,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) static inline void __zs_cpu_down(struct mapping_area *area) { - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); + kfree(area->vm_buf); area->vm_buf = NULL; } @@ -765,12 +1133,17 @@ static void __zs_unmap_object(struct mapping_area *area, { int sizes[2]; void *addr; - char *buf = area->vm_buf; + char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; + buf = area->vm_buf; + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -816,197 +1189,71 @@ static struct notifier_block zs_cpu_nb = { .notifier_call = zs_cpu_notifier }; -static void zs_exit(void) +static int zs_register_cpu_notifier(void) { - int cpu; - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); -} + int cpu, uninitialized_var(ret); -static int zs_init(void) -{ - int cpu, ret; + cpu_notifier_register_begin(); - register_cpu_notifier(&zs_cpu_nb); + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); if (notifier_to_errno(ret)) - goto fail; + break; } - return 0; -fail: - zs_exit(); + + cpu_notifier_register_done(); return notifier_to_errno(ret); } -/** - * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata - * @ops: allocation/free callbacks for expanding the pool - * - * This function must be called before anything when using - * the zsmalloc allocator. - * - * On success, a pointer to the newly created pool is returned, - * otherwise NULL. - */ -struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops) +static void zs_unregister_cpu_notifier(void) { - int i, ovhd_size; - struct zs_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, flags); - if (!pool) - return NULL; + int cpu; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int size; - struct size_class *class; + cpu_notifier_register_begin(); - size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; - if (size > ZS_MAX_ALLOC_SIZE) - size = ZS_MAX_ALLOC_SIZE; + for_each_online_cpu(cpu) + zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); + __unregister_cpu_notifier(&zs_cpu_nb); - class = &pool->size_class[i]; - class->size = size; - class->index = i; - spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); + cpu_notifier_register_done(); +} - } +static void init_zs_size_classes(void) +{ + int nr; - if (ops) - pool->ops = ops; - else - pool->ops = &zs_default_ops; + nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; + if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) + nr += 1; - return pool; + zs_size_classes = nr; } -EXPORT_SYMBOL_GPL(zs_create_pool); -void zs_destroy_pool(struct zs_pool *pool) +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) { - int i; + if (prev->pages_per_zspage != pages_per_zspage) + return false; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int fg; - struct size_class *class = &pool->size_class[i]; + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { - pr_info("Freeing non-empty class with size " - "%db, fullness group %d\n", - class->size, fg); - } - } - } - kfree(pool); + return true; } -EXPORT_SYMBOL_GPL(zs_destroy_pool); -/** - * zs_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * - * On success, handle to the allocated object is returned, - * otherwise 0. - * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. - */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags) +static bool zspage_full(struct page *first_page) { - unsigned long obj; - struct link_free *link; - int class_idx; - struct size_class *class; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; - - if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; - - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); - - spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { - spin_unlock(&class->lock); - first_page = alloc_zspage(pool->ops, class, flags); - if (unlikely(!first_page)) - return 0; - - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; - } - - obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); - first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); - - first_page->inuse++; - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); - spin_unlock(&class->lock); + BUG_ON(!is_first_page(first_page)); - return obj; + return first_page->inuse == first_page->objects; } -EXPORT_SYMBOL_GPL(zs_malloc); -void zs_free(struct zs_pool *pool, unsigned long obj) +unsigned long zs_get_total_pages(struct zs_pool *pool) { - struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; - - int class_idx; - struct size_class *class; - enum fullness_group fullness; - - if (unlikely(!obj)) - return; - - obj_handle_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - - spin_lock(&class->lock); - - /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); - link->next = first_page->freelist; - kunmap_atomic(link); - first_page->freelist = (void *)obj; - - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - - spin_unlock(&class->lock); - - if (fullness == ZS_EMPTY) - free_zspage(pool->ops, first_page); + return atomic_long_read(&pool->pages_allocated); } -EXPORT_SYMBOL_GPL(zs_free); +EXPORT_SYMBOL_GPL(zs_get_total_pages); /** * zs_map_object - get address of allocated object from handle. @@ -1021,18 +1268,19 @@ EXPORT_SYMBOL_GPL(zs_free); * against nested mappings. * * This function returns with preemption and page faults disabled. -*/ + */ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; + void *ret; BUG_ON(!handle); @@ -1043,9 +1291,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - obj_handle_to_location(handle, &page, &obj_idx); + /* From now on, migration cannot move the object */ + pin_tag(handle); + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = &get_cpu_var(zs_map_area); @@ -1053,7 +1305,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; + ret = area->vm_addr + off; + goto out; } /* this object spans two pages */ @@ -1061,14 +1314,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, pages[1] = get_next_page(page); BUG_ON(!pages[1]); - return __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, pages, off, class->size); +out: + if (!class->huge) + ret += ZS_HANDLE_SIZE; + + return ret; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; @@ -1077,12 +1335,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) BUG_ON(!handle); - obj_handle_to_location(handle, &page, &obj_idx); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { @@ -1095,20 +1354,704 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); -u64 zs_get_total_size_bytes(struct zs_pool *pool) +static unsigned long obj_malloc(struct size_class *class, + struct page *first_page, unsigned long handle) +{ + unsigned long obj; + struct link_free *link; + + struct page *m_page; + unsigned long m_objidx, m_offset; + void *vaddr; + + handle |= OBJ_ALLOCATED_TAG; + obj = (unsigned long)first_page->freelist; + obj_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + first_page->freelist = link->next; + if (!class->huge) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record handle in first_page->private */ + set_page_private(first_page, handle); + kunmap_atomic(vaddr); + first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); + + return obj; +} + + +/** + * zs_malloc - Allocate block of given size from pool. + * @pool: pool to allocate from + * @size: size of block to allocate + * + * On success, handle to the allocated object is returned, + * otherwise 0. + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. + */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +{ + unsigned long handle, obj; + struct size_class *class; + struct page *first_page; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return 0; + + handle = alloc_handle(pool, gfp); + if (!handle) + return 0; + + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + + spin_lock(&class->lock); + first_page = find_get_zspage(class); + + if (!first_page) { + spin_unlock(&class->lock); + first_page = alloc_zspage(class, gfp); + if (unlikely(!first_page)) { + free_handle(pool, handle); + return 0; + } + + set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + + spin_lock(&class->lock); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + } + + obj = obj_malloc(class, first_page, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, first_page); + record_obj(handle, obj); + spin_unlock(&class->lock); + + return handle; +} +EXPORT_SYMBOL_GPL(zs_malloc); + +static void obj_free(struct size_class *class, unsigned long obj) +{ + struct link_free *link; + struct page *first_page, *f_page; + unsigned long f_objidx, f_offset; + void *vaddr; + + BUG_ON(!obj); + + obj &= ~OBJ_ALLOCATED_TAG; + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + + vaddr = kmap_atomic(f_page); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); + link->next = first_page->freelist; + if (class->huge) + set_page_private(first_page, 0); + kunmap_atomic(vaddr); + first_page->freelist = (void *)obj; + first_page->inuse--; + zs_stat_dec(class, OBJ_USED, 1); +} + +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct page *first_page, *f_page; + unsigned long obj, f_objidx; + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!handle)) + return; + + pin_tag(handle); + obj = handle_to_obj(handle); + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(class, obj); + fullness = fix_fullness_group(class, first_page); + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } + spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + if (s_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } + + if (d_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int index) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(class, page, addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. */ + int index; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int ret = 0; + + while (1) { + handle = find_alloced_obj(class, s_page, index); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(class, d_page, handle); + zs_object_copy(class, free_obj, used_obj); + index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(class, used_obj); + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + + return ret; +} + +static struct page *isolate_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(class, i, page); + break; + } + } + + return page; +} + +/* + * putback_zspage - add @first_page into right class's fullness list + * @pool: target pool + * @class: destination class + * @first_page: target page + * + * Return @fist_page's fullness_group + */ +static enum fullness_group putback_zspage(struct zs_pool *pool, + struct size_class *class, + struct page *first_page) +{ + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + fullness = get_fullness_group(first_page); + insert_zspage(class, fullness, first_page); + set_zspage_mapping(first_page, class->index, fullness); + + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + + free_zspage(first_page); + } + + return fullness; +} + +static struct page *isolate_source_page(struct size_class *class) { int i; - u64 npages = 0; + struct page *page = NULL; + + for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { + page = class->fullness_list[i]; + if (!page) + continue; - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; + remove_zspage(class, i, page); + break; + } + + return page; +} + +/* + * + * Based on the number of unused allocated objects calculate + * and return the number of pages that we can free. + */ +static unsigned long zs_can_compact(struct size_class *class) +{ + unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); + + if (obj_allocated <= obj_used) + return 0; + + obj_wasted = obj_allocated - obj_used; + obj_wasted /= get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + + return obj_wasted * class->pages_per_zspage; +} + +static void __zs_compact(struct zs_pool *pool, struct size_class *class) +{ + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + if (!zs_can_compact(class)) + break; + + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = isolate_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. + */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + pool->stats.pages_compacted += class->pages_per_zspage; + spin_unlock(&class->lock); + cond_resched(); + spin_lock(&class->lock); + } + + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + __zs_compact(pool, class); + } + + return pool->stats.pages_compacted; +} +EXPORT_SYMBOL_GPL(zs_compact); + +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) +{ + memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); +} +EXPORT_SYMBOL_GPL(zs_pool_stats); + +static unsigned long zs_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_freed; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + pages_freed = pool->stats.pages_compacted; + /* + * Compact classes and calculate compaction delta. + * Can run concurrently with a manually triggered + * (by user) compaction. + */ + pages_freed = zs_compact(pool) - pages_freed; + + return pages_freed ? pages_freed : SHRINK_STOP; +} + +static unsigned long zs_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int i; + struct size_class *class; + unsigned long pages_to_free = 0; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + + pages_to_free += zs_can_compact(class); + } + + return pages_to_free; +} + +static void zs_unregister_shrinker(struct zs_pool *pool) +{ + if (pool->shrinker_enabled) { + unregister_shrinker(&pool->shrinker); + pool->shrinker_enabled = false; + } +} + +static void zs_register_shrinker(struct zs_pool *pool) +{ + pool->shrinker.scan_objects = zs_shrinker_scan; + pool->shrinker.count_objects = zs_shrinker_count; + pool->shrinker.batch = 0; + pool->shrinker.seeks = DEFAULT_SEEKS; + + register_shrinker(&pool->shrinker); +} + +static int zs_register_shrinker_int(struct zs_pool *pool) +{ + zs_register_shrinker(pool); + return 0; +} + +/** + * zs_create_pool - Creates an allocation pool to work from. + * @flags: allocation flags used to allocate pool metadata + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(const char *name) +{ + int i; + struct zs_pool *pool; + struct size_class *prev_class = NULL; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { + kfree(pool); + return NULL; + } + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_handle_cache(pool)) + goto err; + + /* + * Iterate reversly, because, size of size_class that we want to use + * for merging should be larger or equal to current size. + */ + for (i = zs_size_classes - 1; i >= 0; i--) { + int size; + int pages_per_zspage; + struct size_class *class; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we makes size_class point to + * previous size_class if possible. + */ + if (prev_class) { + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; + + class->size = size; + class->index = i; + class->pages_per_zspage = pages_per_zspage; + if (pages_per_zspage == 1 && + get_maxobj_per_zspage(size, pages_per_zspage) == 1) + class->huge = true; + spin_lock_init(&class->lock); + pool->size_class[i] = class; + + prev_class = class; + } + + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); + + /* + * Not critical, we still can use the pool + * and user can trigger compaction manually. + */ + if (zs_register_shrinker_int(pool) == 0) + pool->shrinker_enabled = true; + return pool; + +err: + zs_destroy_pool(pool); + return NULL; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + zs_unregister_shrinker(pool); + zs_pool_stat_destroy(pool); + + for (i = 0; i < zs_size_classes; i++) { + int fg; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + if (class->index != i) + continue; + + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + kfree(class); + } + + destroy_handle_cache(pool); + kfree(pool->size_class); + kfree(pool->name); + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +static int __init zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) + goto notifier_fail; + + init_zs_size_classes(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + + zs_stat_init(); + + return 0; + +notifier_fail: + zs_unregister_cpu_notifier(); + + return ret; +} + +static void __exit zs_exit(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + zs_unregister_cpu_notifier(); - return npages << PAGE_SHIFT; + zs_stat_exit(); } -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); module_init(zs_init); module_exit(zs_exit); diff --git a/mm/zswap.c b/mm/zswap.c index b825f9876..d5980efc1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1,11 +1,11 @@ /* * zswap.c - zswap driver file * - * zswap is a backend for frontswap that takes pages that are in the - * process of being swapped out and attempts to compress them and store - * them in a RAM-based memory pool. This results in a significant I/O - * reduction on the real swap device and, in the case of a slow swap - * device, can also improve workload performance. + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. * * Copyright (C) 2012 Seth Jennings * @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -45,65 +45,62 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -atomic_t zswap_pool_pages = ATOMIC_INIT(0); +/* Total bytes used by the compressed storage */ +u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ atomic_t zswap_stored_pages = ATOMIC_INIT(0); -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK -/* The number of outstanding pages awaiting writeback */ -static atomic_t zswap_outstanding_writebacks = ATOMIC_INIT(0); -#endif - /* * The statistics below are not protected from concurrent access for * performance reasons so they may not be a 100% accurate. However, * they do provide useful information on roughly how many times a * certain event is occurring. */ + +/* Pool limit was hit (see zswap_max_pool_percent) */ static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; -static u64 zswap_writeback_attempted; -static u64 zswap_reject_tmppage_fail; -static u64 zswap_reject_zsmalloc_fail; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -static u64 zswap_saved_by_writeback; +/* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; /********************************* * tunables **********************************/ -/* Enable/disable zswap (enabled by default, fixed at boot for now) */ + +/* Enable/disable zswap (disabled by default) */ static bool zswap_enabled = 1; -module_param_named(enabled, zswap_enabled, bool, 0); +module_param_named(enabled, zswap_enabled, bool, 0644); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" +#ifndef CONFIG_CRYPTO_LZ4 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; -module_param_named(compressor, zswap_compressor, charp, 0); +#else +static char *zswap_compressor = "lz4"; +#endif +module_param_named(compressor, zswap_compressor, charp, 0444); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* - * Maximum compression ratio, as as percentage, for an acceptable - * compressed page. Any pages that do not compress by at least - * this ratio will be rejected. -*/ -static unsigned int zswap_max_compression_ratio = 80; -module_param_named(max_compression_ratio, - zswap_max_compression_ratio, uint, 0644); +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); -/* - * Maximum number of outstanding writebacks allowed at any given time. - * This is to prevent decompressing an unbounded number of compressed - * pages into the swap cache all at once, and to help with writeback - * congestion. -*/ -#define ZSWAP_MAX_OUTSTANDING_FLUSHES 64 +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -157,17 +154,15 @@ static int __init zswap_comp_init(void) return 0; } -static void zswap_comp_exit(void) +static void __init zswap_comp_exit(void) { /* free percpu transforms */ - if (zswap_comp_pcpu_tfms) - free_percpu(zswap_comp_pcpu_tfms); + free_percpu(zswap_comp_pcpu_tfms); } /********************************* * data structures **********************************/ - /* * struct zswap_entry * @@ -175,41 +170,37 @@ static void zswap_comp_exit(void) * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type - * lru - links the entry into the lru list for the appropriate swap type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. - * type - the swap type for the entry. Used to map back to the zswap_tree - * structure that contains the entry. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; - struct list_head lru; - int refcount; pgoff_t offset; - unsigned long handle; + int refcount; unsigned int length; + unsigned long handle; +}; + +struct zswap_header { + swp_entry_t swpentry; }; /* * The tree lock in the zswap_tree struct protects a few things: * - the rbtree - * - the lru list * - the refcount field of each entry in the tree */ struct zswap_tree { struct rb_root rbroot; - struct list_head lru; spinlock_t lock; - struct zs_pool *pool; - unsigned type; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -217,49 +208,35 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; /********************************* * zswap entry functions **********************************/ -#define ZSWAP_KMEM_CACHE_NAME "zswap_entry_cache" static struct kmem_cache *zswap_entry_cache; -static inline int zswap_entry_cache_create(void) +static int __init zswap_entry_cache_create(void) { - zswap_entry_cache = - kmem_cache_create(ZSWAP_KMEM_CACHE_NAME, - sizeof(struct zswap_entry), 0, 0, NULL); - return (zswap_entry_cache == NULL); + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return zswap_entry_cache == NULL; } -static inline void zswap_entry_cache_destory(void) +static void __init zswap_entry_cache_destroy(void) { kmem_cache_destroy(zswap_entry_cache); } -static inline struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) { struct zswap_entry *entry; entry = kmem_cache_alloc(zswap_entry_cache, gfp); if (!entry) return NULL; - INIT_LIST_HEAD(&entry->lru); entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); return entry; } -static inline void zswap_entry_cache_free(struct zswap_entry *entry) +static void zswap_entry_cache_free(struct zswap_entry *entry) { kmem_cache_free(zswap_entry_cache, entry); } -static inline void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -static inline int zswap_entry_put(struct zswap_entry *entry) -{ - entry->refcount--; - return entry->refcount; -} - /********************************* * rbtree functions **********************************/ @@ -281,9 +258,9 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) } /* - * In the case that a entry with the same offset is found, it a pointer to + * In the case that a entry with the same offset is found, a pointer to * the existing entry is stored in dupentry and the function returns -EEXIST -*/ + */ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, struct zswap_entry **dupentry) { @@ -307,6 +284,60 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_free_entry(struct zswap_entry *entry) +{ + zpool_free(zswap_pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody reference the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry = NULL; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + /********************************* * per-cpu code **********************************/ @@ -325,7 +356,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); @@ -342,10 +373,8 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; } dst = per_cpu(zswap_dstmem, cpu); - if (dst) { - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - } + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; break; default: break; @@ -364,108 +393,42 @@ static struct notifier_block zswap_cpu_notifier_block = { .notifier_call = zswap_cpu_notifier }; -static int zswap_cpu_init(void) +static int __init zswap_cpu_init(void) { unsigned long cpu; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) goto cleanup; - register_cpu_notifier(&zswap_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&zswap_cpu_notifier_block); + cpu_notifier_register_done(); return 0; cleanup: for_each_online_cpu(cpu) __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); - put_online_cpus(); + cpu_notifier_register_done(); return -ENOMEM; } -/********************************* -* zsmalloc callbacks -**********************************/ -static mempool_t *zswap_page_pool; - -static inline unsigned int zswap_max_pool_pages(void) -{ - return zswap_max_pool_percent * totalram_pages / 100; -} - -static inline int zswap_page_pool_create(void) -{ - /* TODO: dynamically size mempool */ - zswap_page_pool = mempool_create_page_pool(256, 0); - if (!zswap_page_pool) - return -ENOMEM; - return 0; -} - -static inline void zswap_page_pool_destroy(void) -{ - mempool_destroy(zswap_page_pool); -} - -static struct page *zswap_alloc_page(gfp_t flags) -{ - struct page *page; - - if (atomic_read(&zswap_pool_pages) >= zswap_max_pool_pages()) { - zswap_pool_limit_hit++; - return NULL; - } - page = mempool_alloc(zswap_page_pool, flags); - if (page) - atomic_inc(&zswap_pool_pages); - return page; -} - -static void zswap_free_page(struct page *page) -{ - if (!page) - return; - mempool_free(page, zswap_page_pool); - atomic_dec(&zswap_pool_pages); -} - -static struct zs_ops zswap_zs_ops = { - .alloc = zswap_alloc_page, - .free = zswap_free_page -}; - - /********************************* * helpers **********************************/ - -/* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, - * freeing the entry itself, and decrementing the number of stored pages. - */ -static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) +static bool zswap_is_full(void) { - zs_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK /********************************* * writeback code **********************************/ -static void zswap_end_swap_write(struct bio *bio, int err) -{ - end_swap_bio_write(bio, err); - atomic_dec(&zswap_outstanding_writebacks); - zswap_written_back_pages++; -} - /* return enum for zswap_get_swap_cache_page */ enum zswap_get_swap_ret { ZSWAP_SWAPCACHE_NEW, ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_NOMEM + ZSWAP_SWAPCACHE_FAIL, }; /* @@ -479,15 +442,16 @@ enum zswap_get_swap_ret { * added to the swap cache, and returned in retpage. * * If success, the swap cache page is returned in retpage - * Returns 0 if page was already in the swap cache, page is not locked - * Returns 1 if the new page needs to be populated, page is locked - * Returns <0 on error + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error */ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; @@ -553,13 +517,13 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, if (new_page) page_cache_release(new_page); if (!found_page) - return ZSWAP_SWAPCACHE_NOMEM; + return ZSWAP_SWAPCACHE_FAIL; *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } /* - * Attempts to free and entry by adding a page to the swap cache, + * Attempts to free an entry by adding a page to the swap cache, * decompressing the entry data into the page, and issuing a * bio write to write the page back to the swap device. * @@ -570,12 +534,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. */ -static int zswap_writeback_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { - unsigned long type = tree->type; - struct page *page; + struct zswap_header *zhdr; swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; u8 *src, *dst; unsigned int dlen; int ret; @@ -583,30 +549,46 @@ static int zswap_writeback_entry(struct zswap_tree *tree, .sync_mode = WB_SYNC_NONE, }; - /* get/allocate page in the swap cache */ - swpentry = swp_entry(type, entry->offset); + /* extract swpentry from data */ + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + swpentry = zhdr->swpentry; /* here */ + zpool_unmap_handle(pool, handle); + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + return 0; + } + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + ret = -ENOMEM; + goto fail; - case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ - return -ENOMEM; - break; /* not reached */ - - case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ - return -EEXIST; - break; /* not reached */ + page_cache_release(page); + ret = -EEXIST; + goto fail; case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, - dst, &dlen); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, + entry->length, dst, &dlen); kunmap_atomic(dst); - zs_unmap_object(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -614,148 +596,45 @@ static int zswap_writeback_entry(struct zswap_tree *tree, SetPageUptodate(page); } - /* start writeback */ + /* move it to the tail of the inactive list after end_writeback */ SetPageReclaim(page); - if (!__swap_writepage(page, &wbc, zswap_end_swap_write)) - atomic_inc(&zswap_outstanding_writebacks); - page_cache_release(page); - - return 0; -} - -/* - * Attempts to free nr of entries via writeback to the swap device. - * The number of entries that were actually freed is returned. - */ -static int zswap_writeback_entries(struct zswap_tree *tree, int nr) -{ - struct zswap_entry *entry; - int i, ret, refcount, freed_nr = 0; - - for (i = 0; i < nr; i++) { - /* - * This limits is arbitrary for now until a better - * policy can be implemented. This is so we don't - * eat all of RAM decompressing pages for writeback. - */ - if (atomic_read(&zswap_outstanding_writebacks) > - ZSWAP_MAX_OUTSTANDING_FLUSHES) - break; - - spin_lock(&tree->lock); - - /* dequeue from lru */ - if (list_empty(&tree->lru)) { - spin_unlock(&tree->lock); - break; - } - entry = list_first_entry(&tree->lru, - struct zswap_entry, lru); - list_del_init(&entry->lru); - - /* so invalidate doesn't free the entry from under us */ - zswap_entry_get(entry); - - spin_unlock(&tree->lock); - /* attempt writeback */ - ret = zswap_writeback_entry(tree, entry); - - spin_lock(&tree->lock); - - /* drop reference from above */ - refcount = zswap_entry_put(entry); - - if (!ret) - /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); - - /* - * There are four possible values for refcount here: - * (1) refcount is 2, writeback failed and load is in progress; - * do nothing, load will add us back to the LRU - * (2) refcount is 1, writeback failed; do not free entry, - * add back to LRU - * (3) refcount is 0, (normal case) not invalidate yet; - * remove from rbtree and free entry - * (4) refcount is -1, invalidate happened during writeback; - * free entry - */ - if (refcount == 1) - list_add(&entry->lru, &tree->lru); - - if (refcount == 0) { - /* no invalidate yet, remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); - } - spin_unlock(&tree->lock); - if (refcount <= 0) { - /* free the entry */ - zswap_free_entry(tree, entry); - freed_nr++; - } - } - return freed_nr; -} -#endif /* CONFIG_ZSWAP_ENABLE_WRITEBACK */ - -/******************************************* -* page pool for temporary compression result -********************************************/ -#define ZSWAP_TMPPAGE_POOL_PAGES 16 -static LIST_HEAD(zswap_tmppage_list); -static DEFINE_SPINLOCK(zswap_tmppage_lock); - -static void zswap_tmppage_pool_destroy(void) -{ - struct page *page, *tmppage; + /* start writeback */ + __swap_writepage(page, &wbc, end_swap_bio_write); + page_cache_release(page); + zswap_written_back_pages++; - spin_lock(&zswap_tmppage_lock); - list_for_each_entry_safe(page, tmppage, &zswap_tmppage_list, lru) { - list_del(&page->lru); - __free_pages(page, 1); - } - spin_unlock(&zswap_tmppage_lock); -} + spin_lock(&tree->lock); + /* drop local reference */ + zswap_entry_put(tree, entry); -static int zswap_tmppage_pool_create(void) -{ - int i; - struct page *page; + /* + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); - for (i = 0; i < ZSWAP_TMPPAGE_POOL_PAGES; i++) { - page = alloc_pages(GFP_KERNEL, 1); - if (!page) { - zswap_tmppage_pool_destroy(); - return -ENOMEM; - } - spin_lock(&zswap_tmppage_lock); - list_add(&page->lru, &zswap_tmppage_list); - spin_unlock(&zswap_tmppage_lock); - } - return 0; -} + goto end; -static inline struct page *zswap_tmppage_alloc(void) -{ - struct page *page; - - spin_lock(&zswap_tmppage_lock); - if (list_empty(&zswap_tmppage_list)) { - spin_unlock(&zswap_tmppage_lock); - return NULL; - } - page = list_first_entry(&zswap_tmppage_list, struct page, lru); - list_del(&page->lru); - spin_unlock(&zswap_tmppage_lock); - return page; -} + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may happening concurrently + * it is safe and okay to not free the entry + * if we free the entry in the following put + * it it either okay to return !0 + */ +fail: + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); -static inline void zswap_tmppage_free(struct page *page) -{ - spin_lock(&zswap_tmppage_lock); - list_add(&page->lru, &zswap_tmppage_list); - spin_unlock(&zswap_tmppage_lock); +end: + return ret; } /********************************* @@ -768,25 +647,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; int ret; - unsigned int dlen = PAGE_SIZE; + unsigned int dlen = PAGE_SIZE, len; unsigned long handle; char *buf; u8 *src, *dst; - struct page *tmppage; - bool writeback_attempted = 0; -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - u8 *tmpdst; -#endif + struct zswap_header *zhdr; - if (!tree) { + if (!zswap_enabled || !tree) { ret = -ENODEV; goto reject; } - /* if this page got EIO on pageout before, give up immediately */ - if (PageError(page)) { - ret = -ENOMEM; - goto reject; + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + if (zpool_shrink(zswap_pool, 1, NULL)) { + zswap_reject_reclaim_fail++; + ret = -ENOMEM; + goto reject; + } } /* allocate entry */ @@ -806,62 +685,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = -EINVAL; goto freepage; } - if ((dlen * 100 / PAGE_SIZE) > zswap_max_compression_ratio) { + + /* store */ + len = dlen + sizeof(struct zswap_header); + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + &handle); + if (ret == -ENOSPC) { zswap_reject_compress_poor++; - ret = -E2BIG; goto freepage; } - - /* store */ - handle = zs_malloc(tree->pool, dlen, - __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC | - __GFP_NOWARN); - if (!handle) { -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - zswap_writeback_attempted++; - /* - * Copy compressed buffer out of per-cpu storage so - * we can re-enable preemption. - */ - tmppage = zswap_tmppage_alloc(); - if (!tmppage) { - zswap_reject_tmppage_fail++; - ret = -ENOMEM; - goto freepage; - } - writeback_attempted = 1; - tmpdst = page_address(tmppage); - memcpy(tmpdst, dst, dlen); - dst = tmpdst; - put_cpu_var(zswap_dstmem); - - /* try to free up some space */ - /* TODO: replace with more targeted policy */ - zswap_writeback_entries(tree, 16); - /* try again, allowing wait */ - handle = zs_malloc(tree->pool, dlen, - __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC | - __GFP_NOWARN); - if (!handle) { - /* still no space, fail */ - zswap_reject_zsmalloc_fail++; - ret = -ENOMEM; - goto freepage; - } - zswap_saved_by_writeback++; -#else - ret = -ENOMEM; + if (ret) { + zswap_reject_alloc_fail++; goto freepage; -#endif } - - buf = zs_map_object(tree->pool, handle, ZS_MM_WO); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); + zhdr->swpentry = swp_entry(type, offset); + buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zs_unmap_object(tree->pool, handle); - if (writeback_attempted) - zswap_tmppage_free(tmppage); - else - put_cpu_var(zswap_dstmem); + zpool_unmap_handle(zswap_pool, handle); + put_cpu_var(zswap_dstmem); /* populate entry */ entry->offset = offset; @@ -874,29 +716,21 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); if (ret == -EEXIST) { zswap_duplicate_entry++; - /* remove from rbtree and lru */ - rb_erase(&dupentry->rbnode, &tree->rbroot); - if (!list_empty(&dupentry->lru)) - list_del_init(&dupentry->lru); - if (!zswap_entry_put(dupentry)) { - /* free */ - zswap_free_entry(tree, dupentry); - } + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); - list_add_tail(&entry->lru, &tree->lru); spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; freepage: - if (writeback_attempted) - zswap_tmppage_free(tmppage); - else - put_cpu_var(zswap_dstmem); + put_cpu_var(zswap_dstmem); zswap_entry_cache_free(entry); reject: return ret; @@ -913,59 +747,41 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; u8 *src, *dst; unsigned int dlen; - int refcount; + int ret; /* find */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return -1; } - zswap_entry_get(entry); - - /* remove from lru */ - if (!list_empty(&entry->lru)) - list_del_init(&entry->lru); spin_unlock(&tree->lock); /* decompress */ dlen = PAGE_SIZE; - src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zs_unmap_object(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); + BUG_ON(ret); spin_lock(&tree->lock); - refcount = zswap_entry_put(entry); - if (likely(refcount)) { - list_add_tail(&entry->lru, &tree->lru); - spin_unlock(&tree->lock); - return 0; - } + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - /* - * We don't have to unlink from the rbtree because - * zswap_writeback_entry() or zswap_frontswap_invalidate page() - * has already done this for us if we are the last reference. - */ - /* free */ - - zswap_free_entry(tree, entry); - return 0; } -/* invalidates a single page */ +/* frees an entry in zswap */ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - int refcount; /* find */ spin_lock(&tree->lock); @@ -976,80 +792,51 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) return; } - /* remove from rbtree and lru */ - rb_erase(&entry->rbnode, &tree->rbroot); - if (!list_empty(&entry->lru)) - list_del_init(&entry->lru); + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, entry); /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - - if (refcount) { - /* writeback in progress, writeback will free */ - return; - } - - /* free */ - zswap_free_entry(tree, entry); } -/* invalidates all pages for the given swap type */ +/* frees all zswap entries for the given swap type */ static void zswap_frontswap_invalidate_area(unsigned type) { struct zswap_tree *tree = zswap_trees[type]; - struct rb_node *node; - struct zswap_entry *entry; + struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); - /* - * TODO: Even though this code should not be executed because - * the try_to_unuse() in swapoff should have emptied the tree, - * it is very wasteful to rebalance the tree after every - * removal when we are freeing the whole tree. - * - * If post-order traversal code is ever added to the rbtree - * implementation, it should be used here. - */ - while ((node = rb_first(&tree->rbroot))) { - entry = rb_entry(node, struct zswap_entry, rbnode); - rb_erase(&entry->rbnode, &tree->rbroot); - zs_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - } + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(entry); tree->rbroot = RB_ROOT; - INIT_LIST_HEAD(&tree->lru); spin_unlock(&tree->lock); + kfree(tree); + zswap_trees[type] = NULL; } -/* NOTE: this is called in atomic context from swapon and must not sleep */ +static const struct zpool_ops zswap_zpool_ops = { + .evict = zswap_writeback_entry +}; + static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_ATOMIC); - if (!tree) - goto err; - tree->pool = zs_create_pool(GFP_NOWAIT, &zswap_zs_ops); - if (!tree->pool) - goto freetree; + tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; - INIT_LIST_HEAD(&tree->lru); spin_lock_init(&tree->lock); - tree->type = type; zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -1077,16 +864,12 @@ static int __init zswap_debugfs_init(void) if (!zswap_debugfs_root) return -ENOMEM; - debugfs_create_u64("saved_by_writeback", S_IRUGO, - zswap_debugfs_root, &zswap_saved_by_writeback); debugfs_create_u64("pool_limit_hit", S_IRUGO, zswap_debugfs_root, &zswap_pool_limit_hit); - debugfs_create_u64("reject_writeback_attempted", S_IRUGO, - zswap_debugfs_root, &zswap_writeback_attempted); - debugfs_create_u64("reject_tmppage_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_tmppage_fail); - debugfs_create_u64("reject_zsmalloc_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_zsmalloc_fail); + debugfs_create_u64("reject_reclaim_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_poor", S_IRUGO, @@ -1095,14 +878,11 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_atomic_t("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - debugfs_create_atomic_t("outstanding_writebacks", S_IRUGO, - zswap_debugfs_root, &zswap_outstanding_writebacks); -#endif + return 0; } @@ -1111,12 +891,12 @@ static void __exit zswap_debugfs_exit(void) debugfs_remove_recursive(zswap_debugfs_root); } #else -static inline int __init zswap_debugfs_init(void) +static int __init zswap_debugfs_init(void) { return 0; } -static inline void __exit zswap_debugfs_exit(void) { } +static void __exit zswap_debugfs_exit(void) { } #endif /********************************* @@ -1124,21 +904,28 @@ static inline void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { - if (!zswap_enabled) - return 0; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; pr_info("loading zswap\n"); - if (zswap_entry_cache_create()) { - pr_err("entry cache creation failed\n"); - goto error; + + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); } - if (zswap_page_pool_create()) { - pr_err("page pool initialization failed\n"); - goto pagepoolfail; + if (!zswap_pool) { + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); + goto error; } - if (zswap_tmppage_pool_create()) { - pr_err("workmem pool initialization failed\n"); - goto tmppoolfail; + pr_info("using %s pool\n", zswap_zpool_type); + + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -1148,6 +935,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -1155,11 +943,9 @@ static int __init init_zswap(void) pcpufail: zswap_comp_exit(); compfail: - zswap_tmppage_pool_destroy(); -tmppoolfail: - zswap_page_pool_destroy(); -pagepoolfail: - zswap_entry_cache_destory(); + zswap_entry_cache_destroy(); +cachefail: + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } @@ -1167,5 +953,5 @@ static int __init init_zswap(void) late_initcall(init_zswap); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Seth Jennings "); -MODULE_DESCRIPTION("Compressed cache for swap pages"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Compressed cache for swap pages"); \ No newline at end of file diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8755a3079..7fc10b915 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -137,20 +137,24 @@ static int __xfrm6_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; int mtu = ip6_skb_dst_mtu(skb); + bool toobig; - if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + if (x->props.mode != XFRM_MODE_TUNNEL) + goto skip_frag; + + toobig = skb->len > mtu && !skb_is_gso(skb); + + if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->local_df && skb->len > mtu && skb->sk) { + } else if (!skb->local_df && toobig && skb->sk) { xfrm6_local_error(skb, mtu); return -EMSGSIZE; } - if (x->props.mode == XFRM_MODE_TUNNEL && - ((skb->len > mtu && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb)))) { + if (toobig || dst_allfrag(skb_dst(skb))) return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); - } +skip_frag: return x->outer_mode->afinfo->output_finish(skb); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index bed52a6ba..d7bfc431d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -284,9 +284,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 42eb7ba0b..897a5f14c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -545,6 +545,7 @@ static int send_reply(struct svcxprt_rdma *rdma, { struct ib_send_wr send_wr; struct ib_send_wr inv_wr; + u32 xdr_off; int sge_no; int sge_bytes; int page_no; @@ -584,8 +585,8 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->direction = DMA_TO_DEVICE; /* Map the payload indicated by 'byte_count' */ + xdr_off = 0; for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; if (!vec->frmr) { @@ -623,6 +624,14 @@ static int send_reply(struct svcxprt_rdma *rdma, if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } + + /* The loop above bumps sc_dma_used for each sge. The + * xdr_buf.tail gets a separate sge, but resides in the + * same page as xdr_buf.head. Don't count it twice. + */ + if (sge_no > ctxt->count) + atomic_dec(&rdma->sc_dma_used); + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31275e52c..d4a564fec 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -811,6 +811,7 @@ static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; struct sock *sk = transport->inet; + struct rpc_xprt *xprt = &transport->xprt; if (sk == NULL) return; @@ -824,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); + xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); sk->sk_no_check = 0; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 70d93c605..5b3796788 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1026,7 +1026,7 @@ static void selinux_write_opts(struct seq_file *m, seq_puts(m, prefix); if (has_comma) seq_putc(m, '\"'); - seq_puts(m, opts->mnt_opts[i]); + seq_escape(m, opts->mnt_opts[i], "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } diff --git a/setup_ramdisk.sh b/setup_ramdisk.sh index 0a70b4aa2..9aa0ec904 100755 --- a/setup_ramdisk.sh +++ b/setup_ramdisk.sh @@ -21,7 +21,7 @@ RDIR=$(pwd) # japanese variants: # dcm = N900D / SC-01F (NTT Docomo) # kdi = N900J / SCL22 (au by KDDI) -VARIANT=can +VARIANT=eur ############## SCARY NO-TOUCHY STUFF ############### diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig index 885683a3b..e04062117 100644 --- a/sound/arm/Kconfig +++ b/sound/arm/Kconfig @@ -9,6 +9,14 @@ menuconfig SND_ARM Drivers that are implemented on ASoC can be found in "ALSA for SoC audio support" section. +config SND_PXA2XX_LIB + tristate + select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 + select SND_DMAENGINE_PCM + +config SND_PXA2XX_LIB_AC97 + bool + if SND_ARM config SND_ARMAACI @@ -21,13 +29,6 @@ config SND_PXA2XX_PCM tristate select SND_PCM -config SND_PXA2XX_LIB - tristate - select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 - -config SND_PXA2XX_LIB_AC97 - bool - config SND_PXA2XX_AC97 tristate "AC97 driver for the Intel PXA2xx chip" depends on ARCH_PXA diff --git a/sound/soc/pxa/Kconfig b/sound/soc/pxa/Kconfig index a0f7d3cfa..23deb67b8 100644 --- a/sound/soc/pxa/Kconfig +++ b/sound/soc/pxa/Kconfig @@ -1,7 +1,6 @@ config SND_PXA2XX_SOC tristate "SoC Audio for the Intel PXA2xx chip" depends on ARCH_PXA - select SND_ARM select SND_PXA2XX_LIB help Say Y or M if you want to add support for codecs attached to @@ -15,7 +14,6 @@ config SND_PXA2XX_AC97 config SND_PXA2XX_SOC_AC97 tristate select AC97_BUS - select SND_ARM select SND_PXA2XX_LIB_AC97 select SND_SOC_AC97_BUS diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index d8a22c07e..5e612f881 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1620,7 +1620,7 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event) struct snd_soc_dapm_context *d; LIST_HEAD(up_list); LIST_HEAD(down_list); - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); enum snd_soc_bias_level bias; trace_snd_soc_dapm_start(card); diff --git a/sound/synth/emux/emux_oss.c b/sound/synth/emux/emux_oss.c index daf61abc3..646b66703 100644 --- a/sound/synth/emux/emux_oss.c +++ b/sound/synth/emux/emux_oss.c @@ -69,7 +69,8 @@ snd_emux_init_seq_oss(struct snd_emux *emu) struct snd_seq_oss_reg *arg; struct snd_seq_device *dev; - if (snd_seq_device_new(emu->card, 0, SNDRV_SEQ_DEV_ID_OSS, + /* using device#1 here for avoiding conflicts with OPL3 */ + if (snd_seq_device_new(emu->card, 1, SNDRV_SEQ_DEV_ID_OSS, sizeof(struct snd_seq_oss_reg), &dev) < 0) return; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index c0b70c697..5a4482c2a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1060,25 +1060,19 @@ static void print_cpudesc(struct perf_header *ph, int fd, FILE *fp) static void print_nrcpus(struct perf_header *ph, int fd, FILE *fp) { ssize_t ret; - u32 nr; + u32 nr[2]; ret = read(fd, &nr, sizeof(nr)); if (ret != (ssize_t)sizeof(nr)) - nr = -1; /* interpreted as error */ + nr[0] = nr[1] = -1; /* interpreted as error */ - if (ph->needs_swap) - nr = bswap_32(nr); - - fprintf(fp, "# nrcpus online : %u\n", nr); - - ret = read(fd, &nr, sizeof(nr)); - if (ret != (ssize_t)sizeof(nr)) - nr = -1; /* interpreted as error */ - - if (ph->needs_swap) - nr = bswap_32(nr); + if (ph->needs_swap) { + nr[0] = bswap_32(nr[0]); + nr[1] = bswap_32(nr[1]); + } - fprintf(fp, "# nrcpus avail : %u\n", nr); + fprintf(fp, "# nrcpus online : %u\n", nr[1]); + fprintf(fp, "# nrcpus avail : %u\n", nr[0]); } static void print_version(struct perf_header *ph, int fd, FILE *fp)