From bc58f9bc7d8587f54a1bb9f557c33f1d3d594b24 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Wed, 30 Mar 2016 19:42:06 +0800 Subject: [PATCH 01/52] Try to build with gcc 5.3 toolchain from @dorimanx --- Makefile | 1 + build.sh | 8 +++++++- crypto/testmgr.c | 10 +++++----- drivers/video/msm/mdss/mdss_mdp_pp.c | 2 +- drivers/video/msm/mhl_v2/sii8240/sii8240.c | 11 ++--------- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index c4610edd8..8a53f0d3e 100644 --- a/Makefile +++ b/Makefile @@ -377,6 +377,7 @@ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -fno-common \ -Werror-implicit-function-declaration \ -Wno-format-security \ + -Wno-discarded-array-qualifiers \ -mtune=cortex-a15 \ --param l1-cache-size=32 \ --param l2-cache-size=2048 \ diff --git a/build.sh b/build.sh index d6c7de091..cb8e93ce6 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ RDIR=$(pwd) # japanese variants: # dcm = N900D / SC-01F (NTT Docomo) # kdi = N900J / SCL22 (au by KDDI) -VARIANT=can +VARIANT=eur [ -z $VER ] && \ # version number @@ -66,6 +66,8 @@ TOOLCHAIN=/home/jc/build/toolchain/arm-cortex_a15-linux-gnueabihf-linaro_4.9.4-2 # amount of cpu threads to use in kernel make process THREADS=5 +G_SESNSOR_HUB=1 + ############## SCARY NO-TOUCHY STUFF ############### export ARCH=arm @@ -82,6 +84,10 @@ if ! [ -d $RDIR"/ik.ramdisk/variant/$VARIANT/" ] ; then exit -1 fi +if [ $G_SESNSOR_HUB -eq 1] then + sed -i "s/CONFIG_SENSORS_SSP_STM_LEGACY=y/# CONFIG_SENSORS_SSP_STM_LEGACY is not set/" arch/arm/configs/id_defconfig +fi + [ $PERMISSIVE -eq 1 ] && SELINUX="never_enforce" || SELINUX="always_enforce" KDIR=$RDIR/build/arch/arm/boot diff --git a/crypto/testmgr.c b/crypto/testmgr.c index b0b161d03..360db8701 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -462,7 +462,7 @@ static int test_aead(struct crypto_aead *tfm, int enc, ret = crypto_aead_setkey(tfm, key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: aead: setkey failed on " "test %d for %s: flags=%x\n", j, algo, crypto_aead_get_flags(tfm)); @@ -552,7 +552,7 @@ static int test_aead(struct crypto_aead *tfm, int enc, key = template[i].key; ret = crypto_aead_setkey(tfm, key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: aead: setkey failed on " "chunk test %d for %s: flags=%x\n", j, algo, crypto_aead_get_flags(tfm)); @@ -756,7 +756,7 @@ static int test_cipher(struct crypto_cipher *tfm, int enc, ret = crypto_cipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: cipher: setkey failed " "on test %d for %s: flags=%x\n", j, algo, crypto_cipher_get_flags(tfm)); @@ -852,7 +852,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = crypto_ablkcipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: skcipher: setkey failed " "on test %d for %s: flags=%x\n", j, algo, crypto_ablkcipher_get_flags(tfm)); @@ -916,7 +916,7 @@ static int test_skcipher(struct crypto_ablkcipher *tfm, int enc, ret = crypto_ablkcipher_setkey(tfm, template[i].key, template[i].klen); - if (!ret == template[i].fail) { + if ((!ret) == template[i].fail) { printk(KERN_ERR "alg: skcipher: setkey failed " "on chunk test %d for %s: flags=%x\n", j, algo, diff --git a/drivers/video/msm/mdss/mdss_mdp_pp.c 
b/drivers/video/msm/mdss/mdss_mdp_pp.c index 94ab630a2..def3f7de9 100644 --- a/drivers/video/msm/mdss/mdss_mdp_pp.c +++ b/drivers/video/msm/mdss/mdss_mdp_pp.c @@ -4373,7 +4373,7 @@ int mdss_mdp_ad_input(struct msm_fb_data_type *mfd, mutex_lock(&ad->lock); if ((!PP_AD_STATE_IS_INITCFG(ad->state) && !PP_AD_STS_IS_DIRTY(ad->sts)) && - !input->mode == MDSS_AD_MODE_CALIB) { + input->mode != MDSS_AD_MODE_CALIB) { pr_warn("AD not initialized or configured."); ret = -EPERM; goto error; diff --git a/drivers/video/msm/mhl_v2/sii8240/sii8240.c b/drivers/video/msm/mhl_v2/sii8240/sii8240.c index 9c8c43769..9ad69723f 100755 --- a/drivers/video/msm/mhl_v2/sii8240/sii8240.c +++ b/drivers/video/msm/mhl_v2/sii8240/sii8240.c @@ -444,8 +444,7 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) return ret; } - switch (tmds_on) { - case true: + if (tmds_on) { #ifdef SFEATURE_HDCP_SUPPORT ret = mhl_read_byte_reg(tpi, 0x1A, &value); if (TMDS_OUTPUT_CONTROL_POWER_DOWN & value) { @@ -497,9 +496,8 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) if (unlikely(ret < 0)) pr_err("[ERROR] %s() send AVIF fail\n", __func__); #endif - break; + } else { - case false: #ifdef SFEATURE_HDCP_SUPPORT sii8240_hdcp_on(sii8240, false); #endif @@ -520,11 +518,6 @@ static int tmds_control(struct sii8240_data *sii8240, bool tmds_on) __func__, __LINE__); return ret; } - break; - - default: - pr_err("[ERROR] %s() unknown value\n", __func__); - break; } return ret; } From 8fbaf784cb4b23b377f177c319f83e52bedbda75 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 15 Apr 2016 13:24:48 +0800 Subject: [PATCH 02/52] msm: kgsl: Fix direct references to HZ Make the various timeout values HZ agnostic by using the proper macros and values instead. Change-Id: I906b948657c8873518042c7465272c98c5391e59 Signed-off-by: Suman Tatiraju Signed-off-by: Ajay Dudani --- arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi | 2 +- drivers/gpu/msm/adreno.c | 2 +- drivers/gpu/msm/adreno_dispatch.c | 2 +- drivers/gpu/msm/kgsl_device.h | 1 - drivers/gpu/msm/kgsl_pwrctrl.c | 8 ++++---- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi index 695e45298..21487a85a 100644 --- a/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi +++ b/arch/arm/boot/dts/msm8974/msm8974-gpu.dtsi @@ -24,7 +24,7 @@ qcom,initial-pwrlevel = <1>; - qcom,idle-timeout = <8>; // + qcom,idle-timeout = <80>; //msec qcom,strtstp-sleepwake; qcom,clk-map = <0x0000006>; //KGSL_CLK_CORE | KGSL_CLK_IFACE diff --git a/drivers/gpu/msm/adreno.c b/drivers/gpu/msm/adreno.c index 5cb6f4f64..791989de8 100644 --- a/drivers/gpu/msm/adreno.c +++ b/drivers/gpu/msm/adreno.c @@ -1735,7 +1735,7 @@ static int adreno_of_get_pdata(struct platform_device *pdev) if (adreno_of_read_property(pdev->dev.of_node, "qcom,idle-timeout", &pdata->idle_timeout)) - pdata->idle_timeout = HZ/12; + pdata->idle_timeout = 80; pdata->strtstp_sleepwake = of_property_read_bool(pdev->dev.of_node, "qcom,strtstp-sleepwake"); diff --git a/drivers/gpu/msm/adreno_dispatch.c b/drivers/gpu/msm/adreno_dispatch.c index 12c40c1ae..2a42e64db 100644 --- a/drivers/gpu/msm/adreno_dispatch.c +++ b/drivers/gpu/msm/adreno_dispatch.c @@ -267,7 +267,7 @@ static struct kgsl_cmdbatch *_get_cmdbatch(struct adreno_context *drawctxt) * it hasn't already been started */ if (!timer_pending(&cmdbatch->timer)) - mod_timer(&cmdbatch->timer, jiffies + (5 * HZ)); + mod_timer(&cmdbatch->timer, jiffies + msecs_to_jiffies(5000)); return 
ERR_PTR(-EAGAIN); } diff --git a/drivers/gpu/msm/kgsl_device.h b/drivers/gpu/msm/kgsl_device.h index c69bbee74..234666488 100644 --- a/drivers/gpu/msm/kgsl_device.h +++ b/drivers/gpu/msm/kgsl_device.h @@ -30,7 +30,6 @@ #define KGSL_TIMEOUT_DEFAULT 0xFFFFFFFF #define KGSL_TIMEOUT_PART 50 /* 50 msec */ -#define FIRST_TIMEOUT (HZ / 2) /* KGSL device state is initialized to INIT when platform_probe * diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c index 98140720d..3aab7351b 100644 --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -505,7 +505,7 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev, unsigned int val = 0; struct kgsl_device *device = kgsl_device_from_dev(dev); struct kgsl_pwrctrl *pwr; - const long div = 1000/HZ; + const long div = 1000/msecs_to_jiffies(1000); int ret; if (device == NULL) @@ -532,7 +532,7 @@ static int kgsl_pwrctrl_idle_timer_show(struct device *dev, char *buf) { struct kgsl_device *device = kgsl_device_from_dev(dev); - int mul = 1000/HZ; + int mul = 1000/msecs_to_jiffies(1000); if (device == NULL) return 0; /* Show the idle_timeout converted to msec */ @@ -1121,7 +1121,7 @@ int kgsl_pwrctrl_init(struct kgsl_device *device) pwr->power_flags = 0; pwr->idle_needed = pdata->idle_needed; - pwr->interval_timeout = pdata->idle_timeout; + pwr->interval_timeout = msecs_to_jiffies(pdata->idle_timeout); pwr->strtstp_sleepwake = pdata->strtstp_sleepwake; pwr->ebi1_clk = clk_get(&pdev->dev, "bus_clk"); if (IS_ERR(pwr->ebi1_clk)) @@ -1722,7 +1722,7 @@ static int _check_active_count(struct kgsl_device *device, int count) int kgsl_active_count_wait(struct kgsl_device *device, int count) { int result = 0; - long wait_jiffies = HZ; + long wait_jiffies = msecs_to_jiffies(1000); BUG_ON(!mutex_is_locked(&device->mutex)); From 9da1d3a2a67b1c3dfd83544647c1de2f119ab031 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 6 May 2016 18:36:48 +0800 Subject: [PATCH 03/52] bacon: Set CONFIG_HZ to 300 * As seen on Nexus. * Improves overall interactivity and UX with no power cost. 
Change-Id: Ifeae42feab5f4a0364a6ddf24710a1f9be71b5e6 --- arch/arm/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4b3aa788b..45b36ecca 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -816,9 +816,7 @@ config ARCH_NR_GPIO source kernel/Kconfig.preempt -config HZ - int - default 100 +source kernel/Kconfig.hz config THUMB2_KERNEL bool "Compile the kernel in Thumb-2 mode (EXPERIMENTAL)" From 7130d15432dc9b54794741d67d80837eb91c79d0 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 6 May 2016 19:02:48 +0800 Subject: [PATCH 04/52] msm: Convert direct references to HZ thanks to neobuddy89 neobuddy89/shamu@97c5d3a --- arch/arm/mach-msm/bam_dmux.c | 2 +- arch/arm/mach-msm/dma.c | 2 +- arch/arm/mach-msm/ipc_router.c | 4 +-- arch/arm/mach-msm/platsmp.c | 2 +- arch/arm/mach-msm/qdsp6v2/apr.c | 4 +-- arch/arm/mach-msm/qdsp6v2/apr_tal.c | 2 +- arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c | 12 +++---- .../qdsp6v2/ultrasound/version_b/q6usm_b.c | 30 ++++++++-------- arch/arm/mach-msm/smd_pkt.c | 2 +- arch/arm/mach-msm/smd_qmi.c | 2 +- arch/arm/mach-msm/smd_tty.c | 2 +- arch/arm/mach-msm/smem_log.c | 4 +-- arch/arm/mach-msm/smp2p_spinlock_test.c | 8 ++--- arch/arm/mach-msm/smp2p_test.c | 34 +++++++++---------- arch/arm/mach-msm/timer.c | 2 +- drivers/gpu/msm/adreno_profile.c | 2 +- drivers/platform/msm/usb_bam.c | 8 ++--- 17 files changed, 61 insertions(+), 61 deletions(-) diff --git a/arch/arm/mach-msm/bam_dmux.c b/arch/arm/mach-msm/bam_dmux.c index 529e7d12a..6c5df1d38 100644 --- a/arch/arm/mach-msm/bam_dmux.c +++ b/arch/arm/mach-msm/bam_dmux.c @@ -1716,7 +1716,7 @@ static void ul_wakeup(void) } if (wait_for_dfab) { ret = wait_for_completion_timeout( - &dfab_unvote_completion, HZ); + &dfab_unvote_completion, msecs_to_jiffies(1000)); BUG_ON(ret == 0); } if (likely(do_vote_dfab)) diff --git a/arch/arm/mach-msm/dma.c b/arch/arm/mach-msm/dma.c index afee25f67..adb1b596c 100644 --- a/arch/arm/mach-msm/dma.c +++ b/arch/arm/mach-msm/dma.c @@ -356,7 +356,7 @@ static void msm_dmov_enqueue_cmd_ext_work(struct work_struct *work) } if (!dmov_conf[adm].channel_active) { dmov_conf[adm].clk_ctl = CLK_TO_BE_DIS; - schedule_delayed_work(&dmov_conf[adm].work, (HZ/10)); + schedule_delayed_work(&dmov_conf[adm].work, msecs_to_jiffies(100)); } spin_unlock_irqrestore(&dmov_conf[adm].list_lock, flags); error: diff --git a/arch/arm/mach-msm/ipc_router.c b/arch/arm/mach-msm/ipc_router.c index fa9546984..a84b904a3 100644 --- a/arch/arm/mach-msm/ipc_router.c +++ b/arch/arm/mach-msm/ipc_router.c @@ -189,7 +189,7 @@ static LIST_HEAD(xprt_info_list); static DECLARE_RWSEM(xprt_info_list_lock_lha5); static DECLARE_COMPLETION(msm_ipc_local_router_up); -#define IPC_ROUTER_INIT_TIMEOUT (10 * HZ) +#define IPC_ROUTER_INIT_TIMEOUT 10000 static uint32_t next_port_id; static DEFINE_MUTEX(next_port_id_lock_lha1); @@ -3267,7 +3267,7 @@ void msm_ipc_router_xprt_notify(struct msm_ipc_router_xprt *xprt, if (!msm_ipc_router_workqueue) { ret = wait_for_completion_timeout(&msm_ipc_local_router_up, - IPC_ROUTER_INIT_TIMEOUT); + msecs_to_jiffies(IPC_ROUTER_INIT_TIMEOUT)); if (!ret || !msm_ipc_router_workqueue) { pr_err("%s: IPC Router not initialized\n", __func__); return; diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 796fba1de..c780f776e 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -235,7 +235,7 @@ static int __cpuinit release_from_pen(unsigned int cpu) */ gic_raise_softirq(cpumask_of(cpu), 1); - timeout = 
jiffies + (1 * HZ); + timeout = jiffies + msecs_to_jiffies(1000); while (time_before(jiffies, timeout)) { smp_rmb(); if (pen_release == -1) diff --git a/arch/arm/mach-msm/qdsp6v2/apr.c b/arch/arm/mach-msm/qdsp6v2/apr.c index d3119bccf..9de7c6cf0 100644 --- a/arch/arm/mach-msm/qdsp6v2/apr.c +++ b/arch/arm/mach-msm/qdsp6v2/apr.c @@ -220,11 +220,11 @@ int apr_wait_for_device_up(int dest_id) if (dest_id == APR_DEST_MODEM) rc = wait_event_interruptible_timeout(modem_wait, (apr_get_modem_state() == APR_SUBSYS_UP), - (1 * HZ)); + msecs_to_jiffies(1000)); else if (dest_id == APR_DEST_QDSP6) rc = wait_event_interruptible_timeout(dsp_wait, (apr_get_q6_state() == APR_SUBSYS_UP), - (1 * HZ)); + msecs_to_jiffies(1000)); else pr_err("%s: unknown dest_id %d\n", __func__, dest_id); /* returns left time */ diff --git a/arch/arm/mach-msm/qdsp6v2/apr_tal.c b/arch/arm/mach-msm/qdsp6v2/apr_tal.c index 4443d64e5..333ae8f6e 100644 --- a/arch/arm/mach-msm/qdsp6v2/apr_tal.c +++ b/arch/arm/mach-msm/qdsp6v2/apr_tal.c @@ -183,7 +183,7 @@ struct apr_svc_ch_dev *apr_tal_open(uint32_t svc, uint32_t dest, return NULL; } rc = wait_event_timeout(apr_svc_ch[dl][dest][svc].wait, - (apr_svc_ch[dl][dest][svc].smd_state == 1), 5 * HZ); + (apr_svc_ch[dl][dest][svc].smd_state == 1), msecs_to_jiffies(5000)); if (rc == 0) { pr_err("apr_tal:TIMEOUT for OPEN event\n"); mutex_unlock(&apr_svc_ch[dl][dest][svc].m_lock); diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c index 1c42020f1..9733612fc 100644 --- a/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c +++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/usf.c @@ -31,7 +31,7 @@ #define USF_VERSION_ID 0x0171 /* Standard timeout in the asynchronous ops */ -#define USF_TIMEOUT_JIFFIES (1*HZ) /* 1 sec */ +#define USF_TIMEOUT_JIFFIES 1000 /* 1 sec */ /* Undefined USF device */ #define USF_UNDEF_DEV_ID 0xffff @@ -886,9 +886,9 @@ static int usf_set_us_detection(struct usf_type *usf, unsigned long arg) USF_ADSP_RESTART_STATE)); } else { if (detect_info.detect_timeout == USF_DEFAULT_TIMEOUT) - timeout = USF_TIMEOUT_JIFFIES; + timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES); else - timeout = detect_info.detect_timeout * HZ; + timeout = detect_info.detect_timeout * msecs_to_jiffies(1000); } rc = wait_event_interruptible_timeout(usf_xx->wait, (usf_xx->us_detect_type != @@ -1098,7 +1098,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg) else { prev_jiffies = jiffies; if (upd_tx_info.timeout == USF_DEFAULT_TIMEOUT) { - timeout = USF_TIMEOUT_JIFFIES; + timeout = msecs_to_jiffies(USF_TIMEOUT_JIFFIES); rc = wait_event_timeout( usf_xx->wait, (usf_xx->prev_region != @@ -1107,7 +1107,7 @@ static int usf_get_tx_update(struct usf_type *usf, unsigned long arg) USF_WORK_STATE), timeout); } else { - timeout = upd_tx_info.timeout * HZ; + timeout = upd_tx_info.timeout * msecs_to_jiffies(1000); rc = wait_event_interruptible_timeout( usf_xx->wait, (usf_xx->prev_region != @@ -1190,7 +1190,7 @@ static int usf_set_rx_update(struct usf_xx_type *usf_xx, unsigned long arg) usf_xx->usc, &upd_rx_info.free_region) || (usf_xx->usf_state == USF_IDLE_STATE), - USF_TIMEOUT_JIFFIES); + msecs_to_jiffies(USF_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c index 1a1d58727..4290bddd8 100644 --- a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c +++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c @@ -33,7 +33,7 @@ #define 
WRITEDONE_IDX_STATUS 0 /* Standard timeout in the asynchronous ops */ -#define Q6USM_TIMEOUT_JIFFIES (1*HZ) /* 1 sec */ +#define Q6USM_TIMEOUT_JIFFIES 1000 /* 1 sec */ static DEFINE_MUTEX(session_lock); @@ -103,7 +103,7 @@ static int q6usm_memory_map(uint32_t buf_add, int dir, uint32_t bufsz, rc = wait_event_timeout(this_mmap.cmd_wait, (atomic_read(&this_mmap.cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for memory_map\n", __func__); @@ -141,7 +141,7 @@ int q6usm_memory_unmap(uint32_t buf_add, int dir, uint32_t session, rc = wait_event_timeout(this_mmap.cmd_wait, (atomic_read(&this_mmap.cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for memory_unmap\n", __func__); @@ -795,7 +795,7 @@ int q6usm_open_read(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout, waited for OPEN_READ rc[%d]\n", @@ -905,7 +905,7 @@ int q6usm_enc_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg) } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout opcode[0x%x]\n", @@ -993,7 +993,7 @@ int q6usm_dec_cfg_blk(struct us_client *usc, struct us_encdec_cfg *us_cfg) } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout opcode[0x%x]\n", @@ -1041,7 +1041,7 @@ int q6usm_open_write(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s:timeout. waited for OPEN_WRITR rc[%d]\n", @@ -1079,7 +1079,7 @@ int q6usm_run(struct us_client *usc, uint32_t flags, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: timeout. waited for run success rc[%d]\n", @@ -1280,7 +1280,7 @@ int q6usm_cmd(struct us_client *usc, int cmd) goto fail_cmd; } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s:timeout. 
waited for response opcode[0x%x]\n", @@ -1320,11 +1320,11 @@ int q6usm_set_us_detection(struct us_client *usc, } rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1366,11 +1366,11 @@ int q6usm_set_us_stream_param(int dir, struct us_client *usc, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: CMD_SET_PARAM: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1412,11 +1412,11 @@ int q6usm_get_us_stream_param(int dir, struct us_client *usc, rc = wait_event_timeout(usc->cmd_wait, (atomic_read(&usc->cmd_state) == 0), - Q6USM_TIMEOUT_JIFFIES); + msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; pr_err("%s: CMD_GET_PARAM: timeout=%d\n", - __func__, Q6USM_TIMEOUT_JIFFIES); + __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; diff --git a/arch/arm/mach-msm/smd_pkt.c b/arch/arm/mach-msm/smd_pkt.c index c6001a626..27cd2e50f 100644 --- a/arch/arm/mach-msm/smd_pkt.c +++ b/arch/arm/mach-msm/smd_pkt.c @@ -896,7 +896,7 @@ int smd_pkt_open(struct inode *inode, struct file *file) r = wait_event_interruptible_timeout( smd_pkt_devp->ch_opened_wait_queue, - smd_pkt_devp->is_open, (2 * HZ)); + smd_pkt_devp->is_open, msecs_to_jiffies(2000)); if (r == 0) { r = -ETIMEDOUT; /* close the ch to sync smd's state with smd_pkt */ diff --git a/arch/arm/mach-msm/smd_qmi.c b/arch/arm/mach-msm/smd_qmi.c index ca10f00d0..925fb7d4b 100644 --- a/arch/arm/mach-msm/smd_qmi.c +++ b/arch/arm/mach-msm/smd_qmi.c @@ -509,7 +509,7 @@ static void qmi_notify(void *priv, unsigned event) int sz; sz = smd_cur_packet_size(ctxt->ch); if ((sz > 0) && (sz <= smd_read_avail(ctxt->ch))) { - wake_lock_timeout(&ctxt->wake_lock, HZ / 2); + wake_lock_timeout(&ctxt->wake_lock, msecs_to_jiffies(500)); queue_work(qmi_wq, &ctxt->read_work); } break; diff --git a/arch/arm/mach-msm/smd_tty.c b/arch/arm/mach-msm/smd_tty.c index 55b000340..e7945633e 100644 --- a/arch/arm/mach-msm/smd_tty.c +++ b/arch/arm/mach-msm/smd_tty.c @@ -484,7 +484,7 @@ static int smd_tty_port_activate(struct tty_port *tport, } res = wait_event_interruptible_timeout(info->ch_opened_wait_queue, - info->is_open, (2 * HZ)); + info->is_open, msecs_to_jiffies(2000)); if (res == 0) res = -ETIMEDOUT; if (res < 0) { diff --git a/arch/arm/mach-msm/smem_log.c b/arch/arm/mach-msm/smem_log.c index 35a6e1599..93cf370f8 100644 --- a/arch/arm/mach-msm/smem_log.c +++ b/arch/arm/mach-msm/smem_log.c @@ -1740,7 +1740,7 @@ static int debug_dump(char *buf, int max, uint32_t cont) r = wait_event_interruptible_timeout(inst[GEN].read_wait, inst[GEN].last_read_avail, smem_log_timeout_ms * - HZ / 1000); + msecs_to_jiffies(1)); DBG("%s: read available %d\n", __func__, inst[GEN].last_read_avail); if (r < 0) @@ -1764,7 +1764,7 @@ static int debug_dump_sym(char *buf, int max, uint32_t cont) r = wait_event_interruptible_timeout(inst[GEN].read_wait, inst[GEN].last_read_avail, smem_log_timeout_ms * - HZ / 1000); + msecs_to_jiffies(1)); DBG("%s: readavailable %d\n", __func__, inst[GEN].last_read_avail); if (r < 0) diff --git a/arch/arm/mach-msm/smp2p_spinlock_test.c b/arch/arm/mach-msm/smp2p_spinlock_test.c 
index c14bac0a7..5ecf6be79 100644 --- a/arch/arm/mach-msm/smp2p_spinlock_test.c +++ b/arch/arm/mach-msm/smp2p_spinlock_test.c @@ -88,7 +88,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_out.cb_completion, HZ * 2), + &cb_out.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_out.cb_count, ==, 1); UT_ASSERT_INT(cb_out.event_open, ==, 1); @@ -99,7 +99,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_open, ==, 1); @@ -116,7 +116,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -225,7 +225,7 @@ static void smp2p_ut_remote_spinlock_core(struct seq_file *s, int remote_pid, do { UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ * 2), + &cb_in.cb_completion, msecs_to_jiffies(2000)), >, 0); INIT_COMPLETION(cb_in.cb_completion); ret = msm_smp2p_in_read(remote_pid, diff --git a/arch/arm/mach-msm/smp2p_test.c b/arch/arm/mach-msm/smp2p_test.c index 2f86df4b7..c16e79371 100644 --- a/arch/arm/mach-msm/smp2p_test.c +++ b/arch/arm/mach-msm/smp2p_test.c @@ -85,7 +85,7 @@ static void smp2p_ut_local_basic(struct seq_file *s) /* verify port was opened */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), >, 0); + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2); @@ -173,7 +173,7 @@ static void smp2p_ut_local_late_open(struct seq_file *s) /* verify port was opened */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); @@ -265,7 +265,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 8), + &cb_data.cb_completion, msecs_to_jiffies(125)), ==, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 0); UT_ASSERT_INT(cb_data.event_open, ==, 0); @@ -294,7 +294,7 @@ static void smp2p_ut_local_early_open(struct seq_file *s) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_data.cb_completion, HZ / 2), + &cb_data.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_data.cb_count, ==, 1); UT_ASSERT_INT(cb_data.event_open, ==, 1); @@ -384,7 +384,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) local = msm_smp2p_init_rmt_lpb_proc(SMP2P_REMOTE_MOCK_PROC); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(rmp->rx_interrupt_count, ==, 2); @@ -398,7 +398,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify Echo Response */ @@ -421,7 +421,7 @@ static void 
smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify PINGPONG Response */ @@ -443,7 +443,7 @@ static void smp2p_ut_mock_loopback(struct seq_file *s) rmp->tx_interrupt(); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &rmp->cb_completion, HZ / 2), + &rmp->cb_completion, msecs_to_jiffies(500)), >, 0); /* Verify CLEARALL response */ @@ -494,7 +494,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_out.cb_completion, HZ / 2), + &cb_out.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_out.cb_count, ==, 1); UT_ASSERT_INT(cb_out.event_open, ==, 1); @@ -505,7 +505,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_open, ==, 1); @@ -523,7 +523,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -549,7 +549,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -573,7 +573,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) /* Verify inbound reply */ UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 1); UT_ASSERT_INT(cb_in.event_entry_update, ==, 1); @@ -598,7 +598,7 @@ static void smp2p_ut_remote_inout_core(struct seq_file *s, int remote_pid) UT_ASSERT_INT( (int)wait_for_completion_timeout( - &cb_in.cb_completion, HZ / 2), + &cb_in.cb_completion, msecs_to_jiffies(500)), ==, 0); UT_ASSERT_INT(cb_in.cb_count, ==, 0); UT_ASSERT_INT(cb_in.event_entry_update, ==, 0); @@ -814,7 +814,7 @@ static void smp2p_ut_local_in_max_entries(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in[j].cb_completion), HZ / 2), + &(cb_in[j].cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in[j].cb_count, ==, 1); UT_ASSERT_INT(cb_in[j].event_entry_update, ==, 0); @@ -909,7 +909,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in_1.cb_completion), HZ / 2), + &(cb_in_1.cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in_1.cb_count, ==, 1); UT_ASSERT_INT(cb_in_1.event_entry_update, ==, 0); @@ -920,7 +920,7 @@ static void smp2p_ut_local_in_multiple(struct seq_file *s) UT_ASSERT_INT(ret, ==, 0); UT_ASSERT_INT( (int)wait_for_completion_timeout( - &(cb_in_2.cb_completion), HZ / 2), + &(cb_in_2.cb_completion), msecs_to_jiffies(500)), >, 0); UT_ASSERT_INT(cb_in_2.cb_count, ==, 1); UT_ASSERT_INT(cb_in_2.event_entry_update, ==, 0); 
diff --git a/arch/arm/mach-msm/timer.c b/arch/arm/mach-msm/timer.c index 1ea0f2dc6..2a6e0bda5 100644 --- a/arch/arm/mach-msm/timer.c +++ b/arch/arm/mach-msm/timer.c @@ -722,7 +722,7 @@ static void msm_timer_sync_to_gpt(struct msm_clock *clock, int exit_sleep) &__get_cpu_var(msm_clocks_percpu)[clock->index]; struct msm_timer_sync_data_t data; uint32_t gpt_clk_val; - u64 gpt_period = (1ULL << 32) * HZ; + u64 gpt_period = (1ULL << 32) * msecs_to_jiffies(1000); u64 now = get_jiffies_64(); do_div(gpt_period, gpt_hz); diff --git a/drivers/gpu/msm/adreno_profile.c b/drivers/gpu/msm/adreno_profile.c index ebe9e587f..3929eee47 100644 --- a/drivers/gpu/msm/adreno_profile.c +++ b/drivers/gpu/msm/adreno_profile.c @@ -909,7 +909,7 @@ static int profile_pipe_print(struct file *filep, char __user *ubuf, kgsl_mutex_unlock(&device->mutex, &device->mutex_owner); set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 10); + schedule_timeout(msecs_to_jiffies(100)); kgsl_mutex_lock(&device->mutex, &device->mutex_owner); if (signal_pending(current)) { diff --git a/drivers/platform/msm/usb_bam.c b/drivers/platform/msm/usb_bam.c index bdb7e4122..913dc0bae 100644 --- a/drivers/platform/msm/usb_bam.c +++ b/drivers/platform/msm/usb_bam.c @@ -31,7 +31,7 @@ #define USB_THRESHOLD 512 #define USB_BAM_MAX_STR_LEN 50 -#define USB_BAM_TIMEOUT (10*HZ) +#define USB_BAM_TIMEOUT 10000 enum usb_bam_sm { USB_BAM_SM_INIT = 0, @@ -1073,7 +1073,7 @@ static void wait_for_prod_granted(enum usb_bam cur_bam) } else if (ret == -EINPROGRESS) { pr_debug("%s: Waiting for PROD_GRANTED\n", __func__); if (!wait_for_completion_timeout(&info.prod_avail[cur_bam], - USB_BAM_TIMEOUT)) + msecs_to_jiffies(USB_BAM_TIMEOUT))) pr_err("%s: Timeout wainting for PROD_GRANTED\n", __func__); } else @@ -1117,7 +1117,7 @@ static void wait_for_prod_release(enum usb_bam cur_bam) } else if (ret == -EINPROGRESS) { pr_debug("%s: Waiting for PROD_RELEASED\n", __func__); if (!wait_for_completion_timeout(&info.prod_released[cur_bam], - USB_BAM_TIMEOUT)) + msecs_to_jiffies(USB_BAM_TIMEOUT))) pr_err("%s: Timeout waiting for PROD_RELEASED\n", __func__); } else @@ -2073,7 +2073,7 @@ void usb_bam_reset_complete(void) { pr_debug("Waiting for reset compelte"); if (wait_for_completion_interruptible_timeout(&ctx.reset_done, - 10*HZ) <= 0) + msecs_to_jiffies(10000)) <= 0) pr_warn("Timeout while waiting for reset"); pr_debug("Finished Waiting for reset complete"); From c1b2d6caceed1756e29bc43509ed3db5c845bce5 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 6 May 2016 19:03:56 +0800 Subject: [PATCH 05/52] CONFIG: switch to HZ = 300 --- arch/arm/configs/ik_defconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index 1ec09b28d..e98bc3f5e 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -574,7 +574,11 @@ CONFIG_ARCH_NR_GPIO=0 # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y CONFIG_PREEMPT_COUNT=y -CONFIG_HZ=100 +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 # CONFIG_THUMB2_KERNEL is not set CONFIG_AEABI=y # CONFIG_OABI_COMPAT is not set From dc3e2f326c72df90155c5b36bdd8e1508f22a329 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 6 May 2016 19:19:18 +0800 Subject: [PATCH 06/52] Build eur by default --- Makefile | 1 + build.sh | 6 +++--- ik.ramdisk/common/default.prop | 2 +- ik.ramdisk/common/fstab.qcom | 4 ++-- ik.ramdisk/common/init.qcom.rc | 1 + setup_ramdisk.sh | 2 
+- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 8a53f0d3e..6dce7ac08 100644 --- a/Makefile +++ b/Makefile @@ -378,6 +378,7 @@ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -Werror-implicit-function-declaration \ -Wno-format-security \ -Wno-discarded-array-qualifiers \ + -Wno-switch-bool \ -mtune=cortex-a15 \ --param l1-cache-size=32 \ --param l2-cache-size=2048 \ diff --git a/build.sh b/build.sh index cb8e93ce6..0a6060e70 100755 --- a/build.sh +++ b/build.sh @@ -61,7 +61,7 @@ MAKE_ZIP=1 MAKE_TAR=1 # directory containing cross-compile arm-cortex_a15 toolchain -TOOLCHAIN=/home/jc/build/toolchain/arm-cortex_a15-linux-gnueabihf-linaro_4.9.4-2015.06 +TOOLCHAIN=/home/vagrant/g2/DORIMANX_LG_STOCK_LP_KERNEL/android-toolchain # amount of cpu threads to use in kernel make process THREADS=5 @@ -84,8 +84,8 @@ if ! [ -d $RDIR"/ik.ramdisk/variant/$VARIANT/" ] ; then exit -1 fi -if [ $G_SESNSOR_HUB -eq 1] then - sed -i "s/CONFIG_SENSORS_SSP_STM_LEGACY=y/# CONFIG_SENSORS_SSP_STM_LEGACY is not set/" arch/arm/configs/id_defconfig +if [ $G_SESNSOR_HUB -eq 1 ] ; then + sed -i "s/CONFIG_SENSORS_SSP_STM_LEGACY=y/# CONFIG_SENSORS_SSP_STM_LEGACY is not set/" arch/arm/configs/ik_defconfig fi [ $PERMISSIVE -eq 1 ] && SELINUX="never_enforce" || SELINUX="always_enforce" diff --git a/ik.ramdisk/common/default.prop b/ik.ramdisk/common/default.prop index 077870132..8200bd439 100644 --- a/ik.ramdisk/common/default.prop +++ b/ik.ramdisk/common/default.prop @@ -5,7 +5,7 @@ persist.cne.feature=0 persist.security.ams.enforcing=0 ro.secure=0 ro.allow.mock.location=1 -ro.debuggable=1 +ro.debuggable=0 ro.adb.secure=0 ro.zygote=zygote32 persist.sys.strict_op_enable=false diff --git a/ik.ramdisk/common/fstab.qcom b/ik.ramdisk/common/fstab.qcom index 03e280b7f..d373b4236 100644 --- a/ik.ramdisk/common/fstab.qcom +++ b/ik.ramdisk/common/fstab.qcom @@ -3,10 +3,10 @@ # specify MF_CHECK, and must come before any filesystems that do specify MF_CHECK # -/dev/block/platform/msm_sdcc.1/by-name/system /system f2fs ro wait /dev/block/platform/msm_sdcc.1/by-name/system /system ext4 ro,errors=panic wait -/dev/block/platform/msm_sdcc.1/by-name/userdata /data f2fs nosuid,nodev,noatime,nodiratime,discard,inline_xattr wait,encryptable=footer +/dev/block/platform/msm_sdcc.1/by-name/system /system f2fs ro wait /dev/block/platform/msm_sdcc.1/by-name/userdata /data ext4 nosuid,nodev,noatime,noauto_da_alloc,discard,journal_async_commit,errors=panic wait,check,encryptable=footer +/dev/block/platform/msm_sdcc.1/by-name/userdata /data f2fs nosuid,nodev,noatime,nodiratime,discard,inline_xattr wait,encryptable=footer # VOLD /devices/msm_sdcc.[.23]/mmc_host/mmc* auto vfat default voldmanaged=extSdCard:auto,noemulatedsd /devices/platform/xhci-hcd/usb*sda auto vfat default voldmanaged=UsbDriveA:auto diff --git a/ik.ramdisk/common/init.qcom.rc b/ik.ramdisk/common/init.qcom.rc index 412392f73..5980e5ad3 100755 --- a/ik.ramdisk/common/init.qcom.rc +++ b/ik.ramdisk/common/init.qcom.rc @@ -27,6 +27,7 @@ import init.qcom.usb.rc import init.target.rc +import init.carrier.rc on early-init mount debugfs debugfs /sys/kernel/debug diff --git a/setup_ramdisk.sh b/setup_ramdisk.sh index 0a70b4aa2..9aa0ec904 100755 --- a/setup_ramdisk.sh +++ b/setup_ramdisk.sh @@ -21,7 +21,7 @@ RDIR=$(pwd) # japanese variants: # dcm = N900D / SC-01F (NTT Docomo) # kdi = N900J / SCL22 (au by KDDI) -VARIANT=can +VARIANT=eur ############## SCARY NO-TOUCHY STUFF ############### From 0d2b34201726e33a421fde23dff4ce32bee6f81c Mon 
Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 6 May 2016 19:39:10 +0800 Subject: [PATCH 07/52] Fix build error caused by HZ changes --- arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c index 4290bddd8..3c9f6c6b9 100644 --- a/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c +++ b/arch/arm/mach-msm/qdsp6v2/ultrasound/version_b/q6usm_b.c @@ -1323,7 +1323,7 @@ int q6usm_set_us_detection(struct us_client *usc, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%d\n", + pr_err("%s: CMD_SIGNAL_DETECT_MODE: timeout=%ld\n", __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1369,7 +1369,7 @@ int q6usm_set_us_stream_param(int dir, struct us_client *usc, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_SET_PARAM: timeout=%d\n", + pr_err("%s: CMD_SET_PARAM: timeout=%ld\n", __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; @@ -1415,7 +1415,7 @@ int q6usm_get_us_stream_param(int dir, struct us_client *usc, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); if (!rc) { rc = -ETIME; - pr_err("%s: CMD_GET_PARAM: timeout=%d\n", + pr_err("%s: CMD_GET_PARAM: timeout=%ld\n", __func__, msecs_to_jiffies(Q6USM_TIMEOUT_JIFFIES)); } else rc = 0; From f8f27f73e67bbac3320a686b905cb4c23074d3b7 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 9 May 2016 10:39:06 +0800 Subject: [PATCH 08/52] msm: kgsl: Fix idle timer sysfs after removing references to HZ Dividing by jiffies is nonsense. Change-Id: Id1ca8e11cc19f18f1c435f70c31f17aa76d6e967 --- drivers/gpu/msm/kgsl_pwrctrl.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/msm/kgsl_pwrctrl.c b/drivers/gpu/msm/kgsl_pwrctrl.c index 3aab7351b..8a2b31085 100644 --- a/drivers/gpu/msm/kgsl_pwrctrl.c +++ b/drivers/gpu/msm/kgsl_pwrctrl.c @@ -505,7 +505,6 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev, unsigned int val = 0; struct kgsl_device *device = kgsl_device_from_dev(dev); struct kgsl_pwrctrl *pwr; - const long div = 1000/msecs_to_jiffies(1000); int ret; if (device == NULL) @@ -519,8 +518,7 @@ static int kgsl_pwrctrl_idle_timer_store(struct device *dev, kgsl_mutex_lock(&device->mutex, &device->mutex_owner); /* Let the timeout be requested in ms, but convert to jiffies. 
*/ - val /= div; - pwr->interval_timeout = val; + pwr->interval_timeout = msecs_to_jiffies(val); kgsl_mutex_unlock(&device->mutex, &device->mutex_owner); @@ -532,12 +530,11 @@ static int kgsl_pwrctrl_idle_timer_show(struct device *dev, char *buf) { struct kgsl_device *device = kgsl_device_from_dev(dev); - int mul = 1000/msecs_to_jiffies(1000); if (device == NULL) return 0; /* Show the idle_timeout converted to msec */ return snprintf(buf, PAGE_SIZE, "%d\n", - device->pwrctrl.interval_timeout * mul); + jiffies_to_msecs(device->pwrctrl.interval_timeout)); } static int kgsl_pwrctrl_pmqos_latency_store(struct device *dev, From 619b6be853fa739a1a1630fc70dd48f0577b2622 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 9 May 2016 11:18:23 +0800 Subject: [PATCH 09/52] UKSM: turn off uksm, it is not helping --- ik.ramdisk/common/init.idlekernel.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ik.ramdisk/common/init.idlekernel.sh b/ik.ramdisk/common/init.idlekernel.sh index 44022c976..ccbab5cf8 100755 --- a/ik.ramdisk/common/init.idlekernel.sh +++ b/ik.ramdisk/common/init.idlekernel.sh @@ -118,6 +118,16 @@ if [ ! -f $CFILE ]; then echo $CDEF > $CFILE fi echo `cat $CFILE` > $SFILE +# +# uksm +# +CFILE=$DDIR/uksm_run +SFILE=/sys/kernel/mm/uksm/run +CDEF=N +if [ ! -f $CFILE ]; then + echo $CDEF > $CFILE +fi +echo `cat $CFILE` > $SFILE /sbin/setonboot apply & /system/xbin/busybox run-parts /system/etc/init.d & From 62ac7d795cb52a21e2d3d29f3d210cbffb11b55c Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 9 May 2016 04:00:06 -0400 Subject: [PATCH 10/52] Linux 3.4.112 --- Makefile | 2 +- arch/arm/Makefile | 8 +++ arch/arm/kernel/signal.c | 15 ++++-- crypto/ablkcipher.c | 2 +- crypto/ahash.c | 3 +- crypto/algapi.c | 2 +- crypto/api.c | 6 +-- crypto/crypto_user.c | 2 +- drivers/auxdisplay/ks0108.c | 1 + drivers/base/devres.c | 4 +- drivers/base/platform.c | 8 +-- drivers/base/regmap/regmap-debugfs.c | 5 +- drivers/block/xen-blkfront.c | 3 +- drivers/gpu/drm/drm_crtc.c | 2 +- drivers/gpu/drm/nouveau/nouveau_gem.c | 5 +- drivers/gpu/drm/radeon/radeon_combios.c | 8 +++ drivers/gpu/drm/radeon/radeon_connectors.c | 5 ++ drivers/infiniband/core/cm.c | 10 +++- drivers/infiniband/core/uverbs.h | 3 +- drivers/infiniband/core/uverbs_cmd.c | 10 +++- drivers/infiniband/core/uverbs_main.c | 43 ++++++++++----- drivers/infiniband/hw/mlx4/ah.c | 6 ++- drivers/iommu/amd_iommu.c | 4 +- drivers/iommu/amd_iommu_types.h | 1 + drivers/iommu/intel-iommu.c | 19 ++++--- drivers/macintosh/windfarm_core.c | 2 +- drivers/md/Kconfig | 2 +- drivers/md/md.c | 1 + drivers/md/persistent-data/dm-btree.c | 2 +- drivers/md/raid0.c | 55 ++++++++++++------- drivers/md/raid1.c | 41 +++++++++++++-- drivers/md/raid1.h | 5 ++ drivers/md/raid10.c | 42 +++++++++++++-- drivers/md/raid10.h | 6 +++ drivers/media/rc/rc-main.c | 3 -- drivers/mtd/ubi/io.c | 5 ++ drivers/mtd/ubi/vtbl.c | 1 + drivers/mtd/ubi/wl.c | 1 + drivers/net/wireless/ath/ath9k/init.c | 1 + drivers/net/wireless/iwlwifi/iwl-agn-lib.c | 2 +- drivers/of/address.c | 6 +-- drivers/pci/access.c | 61 +++++++++++++++++++++- drivers/pci/quirks.c | 18 +++++-- drivers/scsi/mvsas/mv_sas.c | 2 + drivers/spi/spi-pxa2xx.c | 4 ++ drivers/spi/spi.c | 3 +- drivers/tty/n_tty.c | 6 +-- drivers/tty/tty_io.c | 31 +++++++++-- drivers/usb/core/config.c | 8 +-- drivers/usb/core/quirks.c | 13 +++++ drivers/usb/host/ehci-sysfs.c | 8 +-- drivers/usb/host/xhci-mem.c | 6 +-- drivers/usb/host/xhci-pci.c | 1 + drivers/usb/host/xhci-ring.c | 33 +++++++++--- drivers/usb/host/xhci.c | 3 +- 
drivers/usb/serial/ftdi_sio.c | 4 ++ drivers/usb/serial/ftdi_sio_ids.h | 8 +++ fs/btrfs/inode.c | 3 +- fs/ceph/super.c | 8 +-- fs/cifs/cifsencrypt.c | 51 +++++++++++++++++- fs/cifs/cifsfs.c | 4 +- fs/ecryptfs/dentry.c | 32 ++++++------ fs/ext4/super.c | 4 +- fs/gfs2/super.c | 6 +-- fs/hfs/bnode.c | 9 ++-- fs/hfs/brec.c | 20 +++---- fs/hfs/super.c | 4 +- fs/hfsplus/bnode.c | 3 -- fs/hfsplus/options.c | 4 +- fs/hostfs/hostfs_kern.c | 2 +- fs/hpfs/namei.c | 25 ++++++++- fs/nfs/nfs4proc.c | 2 +- fs/ocfs2/dlm/dlmmaster.c | 7 ++- fs/ocfs2/dlm/dlmrecovery.c | 6 ++- fs/ocfs2/super.c | 4 +- fs/pipe.c | 5 +- fs/reiserfs/super.c | 8 +-- fs/splice.c | 12 ++++- fs/xfs/xfs_super.c | 4 +- include/linux/pci.h | 2 + include/linux/seq_file.h | 35 +++++++++++++ include/sound/wm8904.h | 2 +- kernel/cgroup.c | 7 +-- kernel/irq/proc.c | 19 ++++++- kernel/module.c | 8 ++- kernel/sched/core.c | 10 ++-- kernel/sched/sched.h | 4 +- kernel/time/clocksource.c | 2 +- mm/filemap.c | 9 ++-- mm/hugetlb.c | 8 +++ net/ipv6/xfrm6_output.c | 16 +++--- net/mac80211/tx.c | 3 -- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 11 +++- net/sunrpc/xprtsock.c | 2 + security/selinux/hooks.c | 2 +- sound/arm/Kconfig | 15 +++--- sound/soc/pxa/Kconfig | 2 - sound/synth/emux/emux_oss.c | 3 +- tools/perf/util/header.c | 22 +++----- 99 files changed, 728 insertions(+), 238 deletions(-) diff --git a/Makefile b/Makefile index 6dce7ac08..470225f3b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 4 -SUBLEVEL = 111 +SUBLEVEL = 112 EXTRAVERSION = NAME = Saber-toothed Squirrel diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 529e88060..b1eedba96 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -53,6 +53,14 @@ endif comma = , +# +# The Scalar Replacement of Aggregates (SRA) optimization pass in GCC 4.9 and +# later may result in code being generated that handles signed short and signed +# char struct members incorrectly. So disable it. +# (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65932) +# +KBUILD_CFLAGS += $(call cc-option,-fno-ipa-sra) + arch-$(CONFIG_ARCH_MSM_KRAIT) :=-D__LINUX_ARM_ARCH__=7 $(call cc-option,-mcpu=cortex-a15,-march=armv7-a) ifeq ($(CONFIG_AEABI),y) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index a178f75da..38ecc608b 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -440,12 +440,17 @@ setup_return(struct pt_regs *regs, struct k_sigaction *ka, */ thumb = handler & 1; -#if __LINUX_ARM_ARCH__ >= 7 +#if __LINUX_ARM_ARCH__ >= 6 /* - * Clear the If-Then Thumb-2 execution state - * ARM spec requires this to be all 000s in ARM mode - * Snapdragon S4/Krait misbehaves on a Thumb=>ARM - * signal transition without this. + * Clear the If-Then Thumb-2 execution state. ARM spec + * requires this to be all 000s in ARM mode. Snapdragon + * S4/Krait misbehaves on a Thumb=>ARM signal transition + * without this. + * + * We must do this whenever we are running on a Thumb-2 + * capable CPU, which includes ARMv6T2. However, we elect + * to do this whenever we're on an ARMv6 or later CPU for + * simplicity. 
*/ cpsr &= ~PSR_IT_MASK; #endif diff --git a/crypto/ablkcipher.c b/crypto/ablkcipher.c index 313a05b8f..7da6f6a21 100644 --- a/crypto/ablkcipher.c +++ b/crypto/ablkcipher.c @@ -715,7 +715,7 @@ struct crypto_ablkcipher *crypto_alloc_ablkcipher(const char *alg_name, err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/crypto/ahash.c b/crypto/ahash.c index d4c8aae5b..2a79f7ff1 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -487,7 +487,8 @@ static int ahash_prepare_alg(struct ahash_alg *alg) struct crypto_alg *base = &alg->halg.base; if (alg->halg.digestsize > PAGE_SIZE / 8 || - alg->halg.statesize > PAGE_SIZE / 8) + alg->halg.statesize > PAGE_SIZE / 8 || + alg->halg.statesize == 0) return -EINVAL; base->cra_type = &crypto_ahash_type; diff --git a/crypto/algapi.c b/crypto/algapi.c index c18f8b104..8df2104b5 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -386,7 +386,7 @@ static void crypto_wait_for_test(struct crypto_larval *larval) crypto_alg_tested(larval->alg.cra_driver_name, 0); } - err = wait_for_completion_interruptible(&larval->completion); + err = wait_for_completion_killable(&larval->completion); WARN_ON(err); out: diff --git a/crypto/api.c b/crypto/api.c index 5c13704f4..29fe0757f 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -181,7 +181,7 @@ static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg) struct crypto_larval *larval = (void *)alg; long timeout; - timeout = wait_for_completion_interruptible_timeout( + timeout = wait_for_completion_killable_timeout( &larval->completion, 60 * HZ); alg = larval->adult; @@ -460,7 +460,7 @@ struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask) err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } @@ -589,7 +589,7 @@ void *crypto_alloc_tfm(const char *alg_name, err: if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index 910497bd7..0c19d0357 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -350,7 +350,7 @@ static struct crypto_alg *crypto_user_aead_alg(const char *name, u32 type, err = PTR_ERR(alg); if (err != -EAGAIN) break; - if (signal_pending(current)) { + if (fatal_signal_pending(current)) { err = -EINTR; break; } diff --git a/drivers/auxdisplay/ks0108.c b/drivers/auxdisplay/ks0108.c index 5b9385239..0d752851a 100644 --- a/drivers/auxdisplay/ks0108.c +++ b/drivers/auxdisplay/ks0108.c @@ -139,6 +139,7 @@ static int __init ks0108_init(void) ks0108_pardevice = parport_register_device(ks0108_parport, KS0108_NAME, NULL, NULL, NULL, PARPORT_DEV_EXCL, NULL); + parport_put_port(ks0108_parport); if (ks0108_pardevice == NULL) { printk(KERN_ERR KS0108_NAME ": ERROR: " "parport didn't register new device\n"); diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 524bf96c2..06c541dc4 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -254,10 +254,10 @@ void * devres_get(struct device *dev, void *new_res, if (!dr) { add_dr(dev, &new_dr->node); dr = new_dr; - new_dr = NULL; + new_res = NULL; } spin_unlock_irqrestore(&dev->devres_lock, flags); - devres_free(new_dr); + devres_free(new_res); return dr->data; } diff --git a/drivers/base/platform.c b/drivers/base/platform.c index a1a722502..5a1373330 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -311,9 +311,7 @@ 
int platform_device_add(struct platform_device *pdev) failed: while (--i >= 0) { struct resource *r = &pdev->resource[i]; - unsigned long type = resource_type(r); - - if (type == IORESOURCE_MEM || type == IORESOURCE_IO) + if (r->parent) release_resource(r); } @@ -338,9 +336,7 @@ void platform_device_del(struct platform_device *pdev) for (i = 0; i < pdev->num_resources; i++) { struct resource *r = &pdev->resource[i]; - unsigned long type = resource_type(r); - - if (type == IORESOURCE_MEM || type == IORESOURCE_IO) + if (r->parent) release_resource(r); } } diff --git a/drivers/base/regmap/regmap-debugfs.c b/drivers/base/regmap/regmap-debugfs.c index 1db128951..023a9d79e 100644 --- a/drivers/base/regmap/regmap-debugfs.c +++ b/drivers/base/regmap/regmap-debugfs.c @@ -23,8 +23,7 @@ static struct dentry *regmap_debugfs_root; /* Calculate the length of a fixed format */ static size_t regmap_calc_reg_len(int max_val, char *buf, size_t buf_size) { - snprintf(buf, buf_size, "%x", max_val); - return strlen(buf); + return snprintf(NULL, 0, "%x", max_val); } static ssize_t regmap_name_read_file(struct file *file, @@ -205,7 +204,7 @@ static ssize_t regmap_access_read_file(struct file *file, /* If we're in the region the user is trying to read */ if (p >= *ppos) { /* ...but not beyond it */ - if (buf_pos >= count - 1 - tot_len) + if (buf_pos + tot_len + 1 >= count) break; /* Format the register */ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index a81cdd7b9..16477b255 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1314,7 +1314,8 @@ static void blkback_changed(struct xenbus_device *dev, break; /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: - blkfront_closing(info); + if (info) + blkfront_closing(info); break; } } diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index ed4b7481a..93c5b2fdf 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -2945,7 +2945,7 @@ static struct drm_property_blob *drm_property_create_blob(struct drm_device *dev struct drm_property_blob *blob; int ret; - if (!length || !data) + if (!length || length > ULONG_MAX - sizeof(struct drm_property_blob) || !data) return NULL; blob = kzalloc(sizeof(struct drm_property_blob)+length, GFP_KERNEL); diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index 2f46bbfbb..b242534f5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -172,11 +172,12 @@ nouveau_gem_info(struct drm_file *file_priv, struct drm_gem_object *gem, struct nouveau_bo *nvbo = nouveau_gem_object(gem); struct nouveau_vma *vma; - if (nvbo->bo.mem.mem_type == TTM_PL_TT) + if (is_power_of_2(nvbo->valid_domains)) + rep->domain = nvbo->valid_domains; + else if (nvbo->bo.mem.mem_type == TTM_PL_TT) rep->domain = NOUVEAU_GEM_DOMAIN_GART; else rep->domain = NOUVEAU_GEM_DOMAIN_VRAM; - rep->offset = nvbo->bo.offset; if (fpriv->vm) { vma = nouveau_bo_vma_find(nvbo, fpriv->vm); diff --git a/drivers/gpu/drm/radeon/radeon_combios.c b/drivers/gpu/drm/radeon/radeon_combios.c index b72eb507d..69c2dd085 100644 --- a/drivers/gpu/drm/radeon/radeon_combios.c +++ b/drivers/gpu/drm/radeon/radeon_combios.c @@ -3399,6 +3399,14 @@ void radeon_combios_asic_init(struct drm_device *dev) rdev->pdev->subsystem_device == 0x30ae) return; + /* quirk for rs4xx HP Compaq dc5750 Small Form Factor to make it resume + * - it hangs on resume inside the dynclk 1 table. 
+ */ + if (rdev->family == CHIP_RS480 && + rdev->pdev->subsystem_vendor == 0x103c && + rdev->pdev->subsystem_device == 0x280a) + return; + /* DYN CLK 1 */ table = combios_get_table_offset(dev, COMBIOS_DYN_CLK_1_TABLE); if (table) diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 9184bbe7c..9c5d96cb6 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -82,6 +82,11 @@ void radeon_connector_hotplug(struct drm_connector *connector) if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) { drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); } else if (radeon_dp_needs_link_train(radeon_connector)) { + /* Don't try to start link training before we + * have the dpcd */ + if (!radeon_dp_getdpcd(radeon_connector)) + return; + /* set it to OFF so that drm_helper_connector_dpms() * won't return immediately since the current state * is ON at this point. diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c889aaef3..90104c6fc 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -856,6 +856,11 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) case IB_CM_SIDR_REQ_RCVD: spin_unlock_irq(&cm_id_priv->lock); cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT); + spin_lock_irq(&cm.lock); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) + rb_erase(&cm_id_priv->sidr_id_node, + &cm.remote_sidr_table); + spin_unlock_irq(&cm.lock); break; case IB_CM_REQ_SENT: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); @@ -3092,7 +3097,10 @@ int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); spin_lock_irqsave(&cm.lock, flags); - rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) { + rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table); + RB_CLEAR_NODE(&cm_id_priv->sidr_id_node); + } spin_unlock_irqrestore(&cm.lock, flags); return 0; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5bcb2afd3..228af1894 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -69,7 +69,7 @@ */ struct ib_uverbs_device { - struct kref ref; + atomic_t refcount; int num_comp_vectors; struct completion comp; struct device *dev; @@ -78,6 +78,7 @@ struct ib_uverbs_device { struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; + struct kobject kobj; }; struct ib_uverbs_event_file { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4d27e4c3f..95885b490 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1979,6 +1979,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, next->send_flags = user_wr->send_flags; if (is_ud) { + if (next->opcode != IB_WR_SEND && + next->opcode != IB_WR_SEND_WITH_IMM) { + ret = -EINVAL; + goto out_put; + } + next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); if (!next->wr.ud.ah) { @@ -2015,9 +2021,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, user_wr->wr.atomic.compare_add; next->wr.atomic.swap = user_wr->wr.atomic.swap; next->wr.atomic.rkey = user_wr->wr.atomic.rkey; + case IB_WR_SEND: break; default: - break; + ret = -EINVAL; + goto out_put; } } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5b51e4e6e..c8e766924 100644 --- a/drivers/infiniband/core/uverbs_main.c 
+++ b/drivers/infiniband/core/uverbs_main.c @@ -117,14 +117,18 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); -static void ib_uverbs_release_dev(struct kref *ref) +static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = - container_of(ref, struct ib_uverbs_device, ref); + container_of(kobj, struct ib_uverbs_device, kobj); - complete(&dev->comp); + kfree(dev); } +static struct kobj_type ib_uverbs_dev_ktype = { + .release = ib_uverbs_release_dev, +}; + static void ib_uverbs_release_event_file(struct kref *ref) { struct ib_uverbs_event_file *file = @@ -273,13 +277,19 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, return context->device->dealloc_ucontext(context); } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ + complete(&dev->comp); +} + static void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); module_put(file->device->ib_dev->owner); - kref_put(&file->device->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&file->device->refcount)) + ib_uverbs_comp_dev(file->device); kfree(file); } @@ -621,9 +631,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) int ret; dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); - if (dev) - kref_get(&dev->ref); - else + if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; if (!try_module_get(dev->ib_dev->owner)) { @@ -644,6 +652,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_init(&file->mutex); filp->private_data = file; + kobject_get(&dev->kobj); return nonseekable_open(inode, filp); @@ -651,13 +660,16 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) module_put(dev->ib_dev->owner); err: - kref_put(&dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&dev->refcount)) + ib_uverbs_comp_dev(dev); + return ret; } static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_device *dev = file->device; ib_uverbs_cleanup_ucontext(file, file->ucontext); @@ -665,6 +677,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) kref_put(&file->async_file->ref, ib_uverbs_release_event_file); kref_put(&file->ref, ib_uverbs_release_file); + kobject_put(&dev->kobj); return 0; } @@ -760,10 +773,11 @@ static void ib_uverbs_add_one(struct ib_device *device) if (!uverbs_dev) return; - kref_init(&uverbs_dev->ref); + atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); + kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -790,6 +804,7 @@ static void ib_uverbs_add_one(struct ib_device *device) cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; + uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; @@ -820,9 +835,10 @@ static void ib_uverbs_add_one(struct ib_device *device) clear_bit(devnum, overflow_map); err: - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); return; } @@ -842,9 +858,10 @@ static void ib_uverbs_remove_one(struct ib_device *device) else clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); - kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); + if (atomic_dec_and_test(&uverbs_dev->refcount)) + ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kfree(uverbs_dev); + kobject_put(&uverbs_dev->kobj); } static char *uverbs_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index a251becda..890c23b3d 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -169,9 +169,13 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) enum rdma_link_layer ll; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + if (ll == IB_LINK_LAYER_ETHERNET) + ah_attr->sl = be32_to_cpu(ah->av.eth.sl_tclass_flowlabel) >> 29; + else + ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; if (ah->av.ib.stat_rate) ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a20d64d4e..4fd4f519c 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -1931,8 +1931,8 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) static void clear_dte_entry(u16 devid) { /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] &= DTE_FLAG_MASK; amd_iommu_apply_erratum_63(devid); } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index c4ffacb03..42f2090d3 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -277,6 +277,7 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) +#define DTE_FLAG_MASK (0x3ffULL << 32) #define DTE_FLAG_IOTLB (0x01UL << 32) #define DTE_FLAG_GV (0x01ULL << 55) #define DTE_GLX_SHIFT (56) diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 2e48b2273..f0a84bc3c 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1827,13 +1827,20 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return -ENOMEM; /* It is large page*/ if (largepage_lvl > 1) { + unsigned long nr_superpages, end_pfn, lvl_pages; + pteval |= DMA_PTE_LARGE_PAGE; - /* Ensure that old small page tables are removed to make room - for superpage, if they exist. 
*/ - dma_pte_clear_range(domain, iov_pfn, - iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1); - dma_pte_free_pagetable(domain, iov_pfn, - iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1); + lvl_pages = lvl_to_nr_pages(largepage_lvl); + + nr_superpages = sg_res / lvl_pages; + end_pfn = iov_pfn + nr_superpages * lvl_pages - 1; + + /* + * Ensure that old small page tables are + * removed to make room for superpage(s). + */ + dma_pte_clear_range(domain, iov_pfn, end_pfn); + dma_pte_free_pagetable(domain, iov_pfn, end_pfn); } else { pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; } diff --git a/drivers/macintosh/windfarm_core.c b/drivers/macintosh/windfarm_core.c index ce8897933..004fa1089 100644 --- a/drivers/macintosh/windfarm_core.c +++ b/drivers/macintosh/windfarm_core.c @@ -421,7 +421,7 @@ int wf_unregister_client(struct notifier_block *nb) { mutex_lock(&wf_lock); blocking_notifier_chain_unregister(&wf_client_list, nb); - wf_client_count++; + wf_client_count--; if (wf_client_count == 0) wf_stop_thread(); mutex_unlock(&wf_lock); diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2241534d3..f3b617eb3 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -347,7 +347,7 @@ config DM_MULTIPATH # of SCSI_DH if the latter isn't defined but if # it is, DM_MULTIPATH must depend on it. We get a build # error if SCSI_DH=m and DM_MULTIPATH=y - depends on SCSI_DH || !SCSI_DH + depends on !SCSI_DH || SCSI ---help--- Allow volume managers to support multipath hardware. diff --git a/drivers/md/md.c b/drivers/md/md.c index 14c3eea6e..29fb41513 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7929,6 +7929,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); } return rv; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index dddd5a47f..be86d59ea 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -502,7 +502,7 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) r = new_block(s->info, &right); if (r < 0) { - /* FIXME: put left */ + unlock_block(s->info, left); return r; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index de63a1fc3..a0bb34a81 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + unsigned short blksize = 512; if (!conf) return -ENOMEM; @@ -102,6 +103,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", @@ -138,6 +142,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % blksize) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size 
%d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); + err = -EINVAL; + goto abort; + } + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -186,9 +202,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) conf->has_merge_bvec = 1; @@ -257,21 +270,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mddev->queue->backing_dev_info.congested_fn = raid0_congested; mddev->queue->backing_dev_info.congested_data = mddev; - /* - * now since we have the hard sector sizes, we can make sure - * chunk size is a multiple of that sector size - */ - if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", - mdname(mddev), - mddev->chunk_sectors << 9); - goto abort; - } - - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -431,6 +429,27 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; + if (mddev->queue) { + struct md_rdev *rdev; + bool discard_supported = false; + + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a5e678ee7..375cdc68d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1285,6 +1285,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", @@ -2055,6 +2056,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; + bool fail = false; for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2067,6 +2069,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * narrow down and record precise write * errors. 
*/ + fail = true; if (!narrow_write_error(r1_bio, m)) { md_error(conf->mddev, conf->mirrors[m].rdev); @@ -2076,9 +2079,17 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } } static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) @@ -2181,6 +2192,29 @@ static void raid1d(struct mddev *mddev) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&conf->bio_end_io_list, + struct r1bio, retry_list); + list_del(&r1_bio->retry_list); + if (mddev->degraded) + set_bit(R1BIO_Degraded, &r1_bio->state); + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -2582,6 +2616,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 80ded1393..50086cf0e 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -48,6 +48,11 @@ struct r1conf { * block, or anything else. */ struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. 
+ */ + struct list_head bio_end_io_list; /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f9a981b0e..6ce5c8d3b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1453,6 +1453,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" "md/raid10:%s: Operation continuing on %d devices.\n", @@ -2526,6 +2527,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } put_buf(r10_bio); } else { + bool fail = false; for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio; @@ -2538,6 +2540,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && !test_bit(BIO_UPTODATE, &bio->bi_flags)) { + fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); set_bit(R10BIO_Degraded, @@ -2555,10 +2558,17 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } } - if (test_bit(R10BIO_WriteError, - &r10_bio->state)) - close_write(r10_bio); - raid_end_bio_io(r10_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } } } @@ -2572,6 +2582,29 @@ static void raid10d(struct mddev *mddev) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r10_bio = list_first_entry(&conf->bio_end_io_list, + struct r10bio, retry_list); + list_del(&r10_bio->retry_list); + if (mddev->degraded) + set_bit(R10BIO_Degraded, &r10_bio->state); + + if (test_bit(R10BIO_WriteError, + &r10_bio->state)) + close_write(r10_bio); + raid_end_bio_io(r10_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -3270,6 +3303,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 7c615613c..c09b38414 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -42,6 +42,12 @@ struct r10conf { sector_t chunk_mask; struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. 
+ */ + struct list_head bio_end_io_list; + /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; int pending_count; diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index cec1f8c05..a7ff6b547 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -946,9 +946,6 @@ static int rc_dev_uevent(struct device *device, struct kobj_uevent_env *env) { struct rc_dev *dev = to_rc_dev(device); - if (!dev || !dev->input_dev) - return -ENODEV; - if (dev->rc_map.name) ADD_HOTPLUG_VAR("NAME=%s", dev->rc_map.name); if (dev->driver_name) diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index 43f1a0011..8f793ea1d 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -942,6 +942,11 @@ static int validate_vid_hdr(const struct ubi_device *ubi, goto bad; } + if (data_size > ubi->leb_size) { + dbg_err("bad data_size"); + goto bad; + } + if (vol_type == UBI_VID_STATIC) { /* * Although from high-level point of view static volumes may diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index c015fc0a7..4105a508f 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -656,6 +656,7 @@ static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si, if (ubi->corr_peb_count) ubi_err("%d PEBs are corrupted and not used", ubi->corr_peb_count); + return -ENOSPC; } ubi->rsvd_pebs += reserved_pebs; ubi->avail_pebs -= reserved_pebs; diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c index 424260a3c..43422b94d 100644 --- a/drivers/mtd/ubi/wl.c +++ b/drivers/mtd/ubi/wl.c @@ -1516,6 +1516,7 @@ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) if (ubi->corr_peb_count) ubi_err("%d PEBs are corrupted and not used", ubi->corr_peb_count); + err = -ENOSPC; goto out_free; } ubi->avail_pebs -= WL_RESERVED_PEBS; diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index cac5b256a..37534a06b 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -683,6 +683,7 @@ void ath9k_set_hw_capab(struct ath_softc *sc, struct ieee80211_hw *hw) hw->max_rate_tries = 10; hw->sta_data_size = sizeof(struct ath_node); hw->vif_data_size = sizeof(struct ath_vif); + hw->extra_tx_headroom = 4; hw->wiphy->available_antennas_rx = BIT(ah->caps.max_rxchains) - 1; hw->wiphy->available_antennas_tx = BIT(ah->caps.max_txchains) - 1; diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c index 56f41c940..6314e24c2 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn-lib.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn-lib.c @@ -1063,7 +1063,7 @@ static void iwlagn_wowlan_program_keys(struct ieee80211_hw *hw, u8 *pn = seq.ccmp.pn; ieee80211_get_key_rx_seq(key, i, &seq); - aes_sc->pn = cpu_to_le64( + aes_sc[i].pn = cpu_to_le64( (u64)pn[5] | ((u64)pn[4] << 8) | ((u64)pn[3] << 16) | diff --git a/drivers/of/address.c b/drivers/of/address.c index 693cdaaf2..05c653958 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -623,10 +623,10 @@ struct device_node *of_find_matching_node_by_address(struct device_node *from, struct resource res; while (dn) { - if (of_address_to_resource(dn, 0, &res)) - continue; - if (res.start == base_address) + if (!of_address_to_resource(dn, 0, &res) && + res.start == base_address) return dn; + dn = of_find_matching_node(dn, matches); } diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 2a581642c..f49d961cf 100644 --- a/drivers/pci/access.c +++ 
b/drivers/pci/access.c @@ -357,6 +357,56 @@ static const struct pci_vpd_ops pci_vpd_pci22_ops = { .release = pci_vpd_pci22_release, }; +static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, + void *arg) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + ssize_t ret; + + if (!tdev) + return -ENODEV; + + ret = pci_read_vpd(tdev, pos, count, arg); + pci_dev_put(tdev); + return ret; +} + +static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count, + const void *arg) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + ssize_t ret; + + if (!tdev) + return -ENODEV; + + ret = pci_write_vpd(tdev, pos, count, arg); + pci_dev_put(tdev); + return ret; +} + +static const struct pci_vpd_ops pci_vpd_f0_ops = { + .read = pci_vpd_f0_read, + .write = pci_vpd_f0_write, + .release = pci_vpd_pci22_release, +}; + +static int pci_vpd_f0_dev_check(struct pci_dev *dev) +{ + struct pci_dev *tdev = pci_get_slot(dev->bus, PCI_SLOT(dev->devfn)); + int ret = 0; + + if (!tdev) + return -ENODEV; + if (!tdev->vpd || !tdev->multifunction || + dev->class != tdev->class || dev->vendor != tdev->vendor || + dev->device != tdev->device) + ret = -ENODEV; + + pci_dev_put(tdev); + return ret; +} + int pci_vpd_pci22_init(struct pci_dev *dev) { struct pci_vpd_pci22 *vpd; @@ -365,12 +415,21 @@ int pci_vpd_pci22_init(struct pci_dev *dev) cap = pci_find_capability(dev, PCI_CAP_ID_VPD); if (!cap) return -ENODEV; + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) { + int ret = pci_vpd_f0_dev_check(dev); + + if (ret) + return ret; + } vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC); if (!vpd) return -ENOMEM; vpd->base.len = PCI_VPD_PCI22_SIZE; - vpd->base.ops = &pci_vpd_pci22_ops; + if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) + vpd->base.ops = &pci_vpd_f0_ops; + else + vpd->base.ops = &pci_vpd_pci22_ops; mutex_init(&vpd->lock); vpd->cap = cap; vpd->busy = false; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index c0300242d..3ce87c82f 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1883,6 +1883,15 @@ static void __devinit quirk_netmos(struct pci_dev *dev) DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_NETMOS, PCI_ANY_ID, PCI_CLASS_COMMUNICATION_SERIAL, 8, quirk_netmos); +static void quirk_f0_vpd_link(struct pci_dev *dev) +{ + if ((dev->class >> 8) != PCI_CLASS_NETWORK_ETHERNET || + !dev->multifunction || !PCI_FUNC(dev->devfn)) + return; + dev->dev_flags |= PCI_DEV_FLAGS_VPD_REF_F0; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_f0_vpd_link); + static void __devinit quirk_e100_interrupt(struct pci_dev *dev) { u16 command, pmcsr; @@ -2834,12 +2843,15 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x3c28, vtd_mask_spec_errors); static void __devinit fixup_ti816x_class(struct pci_dev* dev) { + u32 class = dev->class; + /* TI 816x devices do not have class code set when in PCIe boot mode */ - dev_info(&dev->dev, "Setting PCI class for 816x PCIe device\n"); - dev->class = PCI_CLASS_MULTIMEDIA_VIDEO; + dev->class = PCI_CLASS_MULTIMEDIA_VIDEO << 8; + dev_info(&dev->dev, "PCI class overridden (%#08x -> %#08x)\n", + class, dev->class); } DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_TI, 0xb800, - PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class); + PCI_CLASS_NOT_DEFINED, 0, fixup_ti816x_class); /* Some PCIe devices do not work reliably with the claimed maximum * payload size supported. 
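
The two PCI quirks added above both rely on the layout of the 24-bit value the PCI core keeps in dev->class: base class in bits 23:16, subclass in bits 15:8, programming interface in bits 7:0, whereas the PCI_CLASS_* constants referenced here (PCI_CLASS_NETWORK_ETHERNET, PCI_CLASS_MULTIMEDIA_VIDEO) are 16-bit (base << 8 | subclass) values. That is why quirk_f0_vpd_link() drops the prog-if byte with "dev->class >> 8" before comparing, and why fixup_ti816x_class() now shifts the constant left by 8 instead of assigning it directly. The standalone sketch below is illustrative only and not part of the patch; the helper name is made up, and the constants mirror the values in pci_ids.h:

#include <stdint.h>
#include <stdio.h>

#define PCI_CLASS_NETWORK_ETHERNET 0x0200 /* base 0x02, subclass 0x00 */
#define PCI_CLASS_MULTIMEDIA_VIDEO 0x0400 /* base 0x04, subclass 0x00 */

/* Compose a 24-bit class word the way it sits in dev->class. */
static uint32_t pci_class_word(uint8_t base, uint8_t sub, uint8_t progif)
{
	return ((uint32_t)base << 16) | ((uint32_t)sub << 8) | progif;
}

int main(void)
{
	uint32_t nic = pci_class_word(0x02, 0x00, 0x00);

	/* Same test quirk_f0_vpd_link() performs: discard prog-if, then compare. */
	printf("ethernet? %d\n", (nic >> 8) == PCI_CLASS_NETWORK_ETHERNET);

	/* Same value fixup_ti816x_class() writes: prog-if stays in the low byte. */
	printf("ti816x class word: %#08x\n",
	       (unsigned int)(PCI_CLASS_MULTIMEDIA_VIDEO << 8));
	return 0;
}

Built with any C compiler this prints "ethernet? 1" and "ti816x class word: 0x040000", the latter matching the new class value the reworked dev_info() in fixup_ti816x_class() reports.
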
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c index dbb8edfc8..06da698da 100644 --- a/drivers/scsi/mvsas/mv_sas.c +++ b/drivers/scsi/mvsas/mv_sas.c @@ -984,6 +984,8 @@ static void mvs_slot_free(struct mvs_info *mvi, u32 rx_desc) static void mvs_slot_task_free(struct mvs_info *mvi, struct sas_task *task, struct mvs_slot_info *slot, u32 slot_idx) { + if (!slot) + return; if (!slot->task) return; if (!sas_protocol_ata(task->task_proto)) diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index dc25bee8d..2ecc2d646 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -799,6 +799,10 @@ static irqreturn_t ssp_int(int irq, void *dev_id) if (!(sccr1_reg & SSCR1_TIE)) mask &= ~SSSR_TFS; + /* Ignore RX timeout interrupt if it is disabled */ + if (!(sccr1_reg & SSCR1_TINTE)) + mask &= ~SSSR_TINT; + if (!(status & mask)) return IRQ_NONE; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 3d8f662e4..a3f31e9ab 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -831,8 +831,7 @@ static struct class spi_master_class = { * * The caller is responsible for assigning the bus number and initializing * the master's methods before calling spi_register_master(); and (after errors - * adding the device) calling spi_master_put() and kfree() to prevent a memory - * leak. + * adding the device) calling spi_master_put() to prevent a memory leak. */ struct spi_master *spi_alloc_master(struct device *dev, unsigned size) { diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 0a2d86ea5..b7df271f2 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1310,8 +1310,7 @@ static inline void n_tty_receive_char(struct tty_struct *tty, unsigned char c) tty->canon_data++; spin_unlock_irqrestore(&tty->read_lock, flags); kill_fasync(&tty->fasync, SIGIO, POLL_IN); - if (waitqueue_active(&tty->read_wait)) - wake_up_interruptible(&tty->read_wait); + wake_up_interruptible(&tty->read_wait); return; } } @@ -1434,8 +1433,7 @@ static void n_tty_receive_buf(struct tty_struct *tty, const unsigned char *cp, if ((!tty->icanon && (tty->read_cnt >= tty->minimum_to_wake)) || L_EXTPROC(tty)) { kill_fasync(&tty->fasync, SIGIO, POLL_IN); - if (waitqueue_active(&tty->read_wait)) - wake_up_interruptible(&tty->read_wait); + wake_up_interruptible(&tty->read_wait); } /* diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 8d4a762aa..061fcf47f 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2018,8 +2018,24 @@ static int tty_open(struct inode *inode, struct file *filp) if (!noctty && current->signal->leader && !current->signal->tty && - tty->session == NULL) - __proc_set_tty(current, tty); + tty->session == NULL) { + /* + * Don't let a process that only has write access to the tty + * obtain the privileges associated with having a tty as + * controlling terminal (being able to reopen it with full + * access through /dev/tty, being able to perform pushback). + * Many distributions set the group of all ttys to "tty" and + * grant write-only access to all terminals for setgid tty + * binaries, which should not imply full privileges on all ttys. + * + * This could theoretically break old code that performs open() + * on a write-only file descriptor. In that case, it might be + * necessary to also permit this if + * inode_permission(inode, MAY_READ) == 0. 
+ */ + if (filp->f_mode & FMODE_READ) + __proc_set_tty(current, tty); + } spin_unlock_irq(¤t->sighand->siglock); tty_unlock(); mutex_unlock(&tty_mutex); @@ -2308,7 +2324,7 @@ static int fionbio(struct file *file, int __user *p) * Takes ->siglock() when updating signal->tty */ -static int tiocsctty(struct tty_struct *tty, int arg) +static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) { int ret = 0; if (current->signal->leader && (task_session(current) == tty->session)) @@ -2341,6 +2357,13 @@ static int tiocsctty(struct tty_struct *tty, int arg) goto unlock; } } + + /* See the comment in tty_open(). */ + if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto unlock; + } + proc_set_tty(current, tty); unlock: mutex_unlock(&tty_mutex); @@ -2698,7 +2721,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) no_tty(); return 0; case TIOCSCTTY: - return tiocsctty(tty, arg); + return tiocsctty(tty, file, arg); case TIOCGPGRP: return tiocgpgrp(tty, real_tty, p); case TIOCSPGRP: diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index cc1004a2f..bfc9b6912 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -114,16 +114,18 @@ static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 16; } else if (usb_endpoint_xfer_isoc(&ep->desc) && - desc->bmAttributes > 2) { + USB_SS_MULT(desc->bmAttributes) > 3) { dev_warn(ddev, "Isoc endpoint has Mult of %d in " "config %d interface %d altsetting %d ep %d: " - "setting to 3\n", desc->bmAttributes + 1, + "setting to 3\n", + USB_SS_MULT(desc->bmAttributes), cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 2; } if (usb_endpoint_xfer_isoc(&ep->desc)) - max_tx = (desc->bMaxBurst + 1) * (desc->bmAttributes + 1) * + max_tx = (desc->bMaxBurst + 1) * + (USB_SS_MULT(desc->bmAttributes)) * usb_endpoint_maxp(&ep->desc); else if (usb_endpoint_xfer_int(&ep->desc)) max_tx = usb_endpoint_maxp(&ep->desc) * diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index e845a1f80..392c125fc 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -49,6 +49,13 @@ static const struct usb_device_id usb_quirk_list[] = { /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Logitech ConferenceCam CC3000e */ + { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x046d, 0x0848), .driver_info = USB_QUIRK_DELAY_INIT }, + + /* Logitech PTZ Pro Camera */ + { USB_DEVICE(0x046d, 0x0853), .driver_info = USB_QUIRK_DELAY_INIT }, + /* Logitech Quickcam Fusion */ { USB_DEVICE(0x046d, 0x08c1), .driver_info = USB_QUIRK_RESET_RESUME }, @@ -73,6 +80,12 @@ static const struct usb_device_id usb_quirk_list[] = { /* Philips PSC805 audio device */ { USB_DEVICE(0x0471, 0x0155), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Plantronic Audio 655 DSP */ + { USB_DEVICE(0x047f, 0xc008), .driver_info = USB_QUIRK_RESET_RESUME }, + + /* Plantronic Audio 648 USB */ + { USB_DEVICE(0x047f, 0xc013), .driver_info = USB_QUIRK_RESET_RESUME }, + /* Artisman Watchdog Dongle */ { USB_DEVICE(0x04b4, 0x0526), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, diff --git a/drivers/usb/host/ehci-sysfs.c b/drivers/usb/host/ehci-sysfs.c index 14ced00ba..065902429 100644 --- a/drivers/usb/host/ehci-sysfs.c +++ b/drivers/usb/host/ehci-sysfs.c @@ -29,7 +29,7 @@ static ssize_t 
show_companion(struct device *dev, int count = PAGE_SIZE; char *ptr = buf; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); nports = HCS_N_PORTS(ehci->hcs_params); for (index = 0; index < nports; ++index) { @@ -54,7 +54,7 @@ static ssize_t store_companion(struct device *dev, struct ehci_hcd *ehci; int portnum, new_owner; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); new_owner = PORT_OWNER; /* Owned by companion */ if (sscanf(buf, "%d", &portnum) != 1) return -EINVAL; @@ -85,7 +85,7 @@ static ssize_t show_uframe_periodic_max(struct device *dev, struct ehci_hcd *ehci; int n; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); n = scnprintf(buf, PAGE_SIZE, "%d\n", ehci->uframe_periodic_max); return n; } @@ -102,7 +102,7 @@ static ssize_t store_uframe_periodic_max(struct device *dev, unsigned long flags; ssize_t ret; - ehci = hcd_to_ehci(bus_to_hcd(dev_get_drvdata(dev))); + ehci = hcd_to_ehci(dev_get_drvdata(dev)); if (kstrtouint(buf, 0, &uframe_periodic_max) < 0) return -EINVAL; diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index 048cc382a..cad4a174e 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -1493,10 +1493,10 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, * use Event Data TRBs, and we don't chain in a link TRB on short * transfers, we're basically dividing by 1. * - * xHCI 1.0 specification indicates that the Average TRB Length should - * be set to 8 for control endpoints. + * xHCI 1.0 and 1.1 specification indicates that the Average TRB Length + * should be set to 8 for control endpoints. */ - if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version == 0x100) + if (usb_endpoint_xfer_control(&ep->desc) && xhci->hci_version >= 0x100) ep_ctx->tx_info |= cpu_to_le32(AVG_TRB_LENGTH_FOR_EP(8)); else ep_ctx->tx_info |= diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index 710b2e98b..305393373 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -121,6 +121,7 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) * PPT chipsets. 
*/ xhci->quirks |= XHCI_SPURIOUS_REBOOT; + xhci->quirks |= XHCI_SPURIOUS_WAKEUP; } if (pdev->vendor == PCI_VENDOR_ID_INTEL && (pdev->device == PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_XHCI || diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index d857ba543..5d4f1b754 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -331,6 +331,15 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci) ret = handshake(xhci, &xhci->op_regs->cmd_ring, CMD_RING_RUNNING, 0, 5 * 1000 * 1000); if (ret < 0) { + /* we are about to kill xhci, give it one more chance */ + xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, + &xhci->op_regs->cmd_ring); + udelay(1000); + ret = handshake(xhci, &xhci->op_regs->cmd_ring, + CMD_RING_RUNNING, 0, 3 * 1000 * 1000); + if (ret == 0) + return 0; + xhci_err(xhci, "Stopped the command ring failed, " "maybe the host is dead\n"); xhci->xhc_state |= XHCI_STATE_DYING; @@ -2337,6 +2346,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, u32 trb_comp_code; int ret = 0; int td_num = 0; + bool handling_skipped_tds = false; slot_id = TRB_TO_SLOT_ID(le32_to_cpu(event->flags)); xdev = xhci->devs[slot_id]; @@ -2470,6 +2480,10 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep->skip = true; xhci_dbg(xhci, "Miss service interval error, set skip flag\n"); goto cleanup; + case COMP_PING_ERR: + ep->skip = true; + xhci_dbg(xhci, "No Ping response error, Skip one Isoc TD\n"); + goto cleanup; default: if (xhci_is_vendor_info_code(xhci, trb_comp_code)) { status = 0; @@ -2601,13 +2615,18 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep, &status); cleanup: + + + handling_skipped_tds = ep->skip && + trb_comp_code != COMP_MISSED_INT && + trb_comp_code != COMP_PING_ERR; + /* - * Do not update event ring dequeue pointer if ep->skip is set. - * Will roll back to continue process missed tds. + * Do not update event ring dequeue pointer if we're in a loop + * processing missed tds. */ - if (trb_comp_code == COMP_MISSED_INT || !ep->skip) { + if (!handling_skipped_tds) inc_deq(xhci, xhci->event_ring); - } if (ret) { urb = td->urb; @@ -2642,7 +2661,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, * Process them as short transfer until reach the td pointed by * the event. 
*/ - } while (ep->skip && trb_comp_code != COMP_MISSED_INT); + } while (handling_skipped_tds); return 0; } @@ -3493,8 +3512,8 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags, if (start_cycle == 0) field |= 0x1; - /* xHCI 1.0 6.4.1.2.1: Transfer Type field */ - if (xhci->hci_version == 0x100) { + /* xHCI 1.0/1.1 6.4.1.2.1: Transfer Type field */ + if (xhci->hci_version >= 0x100) { if (urb->transfer_buffer_length > 0) { if (setup->bRequestType & USB_DIR_IN) field |= TRB_TX_TYPE(TRB_DATA_IN); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index c918fd6aa..eca811ef9 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -141,7 +141,8 @@ static int xhci_start(struct xhci_hcd *xhci) "waited %u microseconds.\n", XHCI_MAX_HALT_USEC); if (!ret) - xhci->xhc_state &= ~XHCI_STATE_HALTED; + xhci->xhc_state &= ~(XHCI_STATE_HALTED | XHCI_STATE_DYING); + return ret; } diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 1e4899c2d..4038789d6 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -629,6 +629,10 @@ static struct usb_device_id id_table_combined [] = { { USB_DEVICE(FTDI_VID, FTDI_NT_ORIONLXM_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, { USB_DEVICE(FTDI_VID, FTDI_SYNAPSE_SS200_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX2WI_PID) }, + { USB_DEVICE(FTDI_VID, FTDI_CUSTOMWARE_MINIPLEX3_PID) }, /* * ELV devices: */ diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 1fee973f1..70b24c02b 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -568,6 +568,14 @@ */ #define FTDI_SYNAPSE_SS200_PID 0x9090 /* SS200 - SNAP Stick 200 */ +/* + * CustomWare / ShipModul NMEA multiplexers product ids (FTDI_VID) + */ +#define FTDI_CUSTOMWARE_MINIPLEX_PID 0xfd48 /* MiniPlex first generation NMEA Multiplexer */ +#define FTDI_CUSTOMWARE_MINIPLEX2_PID 0xfd49 /* MiniPlex-USB and MiniPlex-2 series */ +#define FTDI_CUSTOMWARE_MINIPLEX2WI_PID 0xfd4a /* MiniPlex-2Wi */ +#define FTDI_CUSTOMWARE_MINIPLEX3_PID 0xfd4b /* MiniPlex-3 series */ + /********************************/ /** third-party VID/PID combos **/ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9e5132582..575c1902d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3685,7 +3685,8 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? 
*/ - btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (!special_file(inode->i_mode)) + btrfs_wait_ordered_range(inode, 0, (u64)-1); if (root->fs_info->log_root_recovering) { BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f4fa5cf0c..e5eacd9dd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -383,8 +383,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); - if (opt->name) - seq_printf(m, ",name=%s", opt->name); + if (opt->name) { + seq_puts(m, ",name="); + seq_escape(m, opt->name, ", \t\n\\"); + } if (opt->key) seq_puts(m, ",secret="); @@ -429,7 +431,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 6dd3b61ea..8431216ee 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -388,6 +388,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) return 0; } +/* Server has provided av pairs/target info in the type 2 challenge + * packet and we have plucked it and stored within smb session. + * We parse that blob here to find the server given timestamp + * as part of ntlmv2 authentication (or local current time as + * default in case of failure) + */ +static __le64 +find_timestamp(struct cifs_ses *ses) +{ + unsigned int attrsize; + unsigned int type; + unsigned int onesize = sizeof(struct ntlmssp2_name); + unsigned char *blobptr; + unsigned char *blobend; + struct ntlmssp2_name *attrptr; + + if (!ses->auth_key.len || !ses->auth_key.response) + return 0; + + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; + + while (blobptr + onesize < blobend) { + attrptr = (struct ntlmssp2_name *) blobptr; + type = le16_to_cpu(attrptr->type); + if (type == NTLMSSP_AV_EOL) + break; + blobptr += 2; /* advance attr type */ + attrsize = le16_to_cpu(attrptr->length); + blobptr += 2; /* advance attr size */ + if (blobptr + attrsize > blobend) + break; + if (type == NTLMSSP_AV_TIMESTAMP) { + if (attrsize == sizeof(u64)) + return *((__le64 *)blobptr); + } + blobptr += attrsize; /* advance attr value */ + } + + return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); +} + static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, const struct nls_table *nls_cp) { @@ -549,6 +591,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) struct ntlmv2_resp *buf; char ntlmv2_hash[16]; unsigned char *tiblob = NULL; /* target info blob */ + __le64 rsp_timestamp; if (ses->server->secType == RawNTLMSSP) { if (!ses->domainName) { @@ -566,6 +609,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) } } + /* Must be within 5 minutes of the server (or in range +/-2h + * in case of Mac OS X), so simply carry over server timestamp + * (as Windows 7 does) + */ + rsp_timestamp = find_timestamp(ses); + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); tilen = ses->auth_key.len; tiblob = ses->auth_key.response; @@ -583,7 +632,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); buf->blob_signature = cpu_to_le32(0x00000101); 
buf->reserved = 0; - buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + buf->time = rsp_timestamp; get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); buf->reserved2 = 0; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c0f65e848..5b730ba78 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -373,10 +373,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 534c1d46e..eba8f1d4a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -55,26 +55,26 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); - if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) - goto out; - if (nd) { - dentry_save = nd->path.dentry; - vfsmount_save = nd->path.mnt; - nd->path.dentry = lower_dentry; - nd->path.mnt = lower_mnt; - } - rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); - if (nd) { - nd->path.dentry = dentry_save; - nd->path.mnt = vfsmount_save; + if (lower_dentry->d_op && lower_dentry->d_op->d_revalidate) { + if (nd) { + dentry_save = nd->path.dentry; + vfsmount_save = nd->path.mnt; + nd->path.dentry = lower_dentry; + nd->path.mnt = lower_mnt; + } + rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); + if (nd) { + nd->path.dentry = dentry_save; + nd->path.mnt = vfsmount_save; + } } if (dentry->d_inode) { - struct inode *lower_inode = - ecryptfs_inode_to_lower(dentry->d_inode); + struct inode *inode = dentry->d_inode; - fsstack_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode)); + if (!inode->i_nlink) + return 0; } -out: return rc; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f408762a..951ff257f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1677,10 +1677,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6172fa77a..4db9a9a31 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1298,11 +1298,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_printf(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_printf(s, ",spectator"); if (args->ar_localflocks) diff --git 
a/fs/hfs/bnode.c b/fs/hfs/bnode.c index cdb41a1f6..8daea16ef 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -287,7 +287,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -397,11 +396,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) void hfs_bnode_free(struct hfs_bnode *node) { - //int i; + int i; - //for (i = 0; i < node->tree->pages_per_bnode; i++) - // if (node->page[i]) - // page_cache_release(node->page[i]); + for (i = 0; i < node->tree->pages_per_bnode; i++) + if (node->page[i]) + page_cache_release(node->page[i]); kfree(node); } diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index 92fb358ce..db240c54a 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -132,13 +132,16 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -167,9 +170,6 @@ int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len) goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -366,6 +366,8 @@ static int hfs_brec_update_parent(struct hfs_find_data *fd) if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7b4c537d6..be0e218a3 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -138,9 +138,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", sbi->s_uid, sbi->s_gid); if (sbi->s_file_umask != 0133) seq_printf(seq, ",file_umask=%o", sbi->s_file_umask); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 1c42cc5b8..a1e91092f 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) page_cache_release(page); goto fail; } - page_cache_release(page); node->page[i] = page; } @@ -566,13 +565,11 @@ struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num) void hfs_bnode_free(struct hfs_bnode *node) { -#if 0 int i; for (i = 0; i < node->tree->pages_per_bnode; i++) if (node->page[i]) page_cache_release(node->page[i]); -#endif kfree(node); } diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 06fa56186..38e41d07d 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -211,9 +211,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - 
seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, sbi->uid, sbi->gid); if (sbi->part >= 0) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07c516bfe..fe63b15f5 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -264,7 +264,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); return 0; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 30dd7b10b..bdb86a8a8 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -8,6 +8,17 @@ #include #include "hpfs_fn.h" +static void hpfs_update_directory_times(struct inode *dir) +{ + time_t t = get_seconds(); + if (t == dir->i_mtime.tv_sec && + t == dir->i_ctime.tv_sec) + return; + dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; + dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + hpfs_write_inode_nolock(dir); +} + static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; @@ -99,6 +110,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) result->i_mode = mode | S_IFDIR; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -187,6 +199,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, s result->i_mode = mode | S_IFREG; hpfs_write_inode_nolock(result); } + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -262,6 +275,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); brelse(bh); hpfs_unlock(dir->i_sb); @@ -340,6 +354,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy insert_inode_hash(result); hpfs_write_inode_nolock(result); + hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); return 0; @@ -423,6 +438,8 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -477,6 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) out1: hpfs_brelse4(&qbh); out: + if (!err) + hpfs_update_directory_times(dir); hpfs_unlock(dir->i_sb); return err; } @@ -595,7 +614,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end1; } - end: +end: hpfs_i(i)->i_parent_dir = new_dir->i_ino; if (S_ISDIR(i->i_mode)) { inc_nlink(new_dir); @@ -610,6 +629,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, brelse(bh); } end1: + if (!err) { + hpfs_update_directory_times(old_dir); + hpfs_update_directory_times(new_dir); + } hpfs_unlock(i->i_sb); return err; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3d344ab0b..92eff4da0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1851,7 +1851,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, fmode_t fmode if 
(server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - if (opendata->o_arg.open_flags & O_EXCL) { + if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) { nfs4_exclusive_attrset(opendata, sattr); nfs_fattr_init(opendata->o_res.f_attr); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 7ba6ac187..8e48ba5f6 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1411,6 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, int found, ret; int set_maybe; int dispatch_assert = 0; + int dispatched = 0; if (!dlm_grab(dlm)) return DLM_MASTER_RESP_NO; @@ -1617,13 +1618,16 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, mlog(ML_ERROR, "failed to dispatch assert master work\n"); response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); + } else { + dispatched = 1; } } else { if (res) dlm_lockres_put(res); } - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return response; } @@ -2041,7 +2045,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, /* queue up work for dlm_assert_master_worker */ - dlm_grab(dlm); /* get an extra ref for the work item */ dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); item->u.am.lockres = res; /* already have a ref */ /* can optionally ignore node numbers higher than this node */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d15b0714e..0e5013ed7 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1689,6 +1689,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; + int dispatched = 0; if (!dlm_grab(dlm)) { /* since the domain has gone away on this @@ -1710,6 +1711,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, mlog_errno(-ENOMEM); /* retry!? */ BUG(); + } else { + dispatched = 1; } } else /* put.. 
incase we are not the master */ dlm_lockres_put(res); @@ -1717,7 +1720,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, } spin_unlock(&dlm->spinlock); - dlm_put(dlm); + if (!dispatched) + dlm_put(dlm); return master; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 68f4541c2..91a0020a0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1578,8 +1578,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) diff --git a/fs/pipe.c b/fs/pipe.c index abfb93525..6049235e2 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -390,6 +390,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, void *addr; size_t chars = buf->len, remaining; int error, atomic; + int offset; if (chars > total_len) chars = total_len; @@ -403,9 +404,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, atomic = !iov_fault_in_pages_write(iov, chars); remaining = chars; + offset = buf->offset; redo: addr = ops->map(pipe, buf, atomic); - error = pipe_iov_copy_to_user(iov, addr, &buf->offset, + error = pipe_iov_copy_to_user(iov, addr, &offset, &remaining, atomic); ops->unmap(pipe, buf, addr); if (unlikely(error)) { @@ -421,6 +423,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } ret += chars; + buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 8169be93a..e12357bb3 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -645,18 +645,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/splice.c b/fs/splice.c index 67c5210e7..286417764 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1165,7 +1165,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1208,6 +1208,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. 
*/ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1220,6 +1221,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, read_len = ret; sd->total_len = read_len; + /* + * If more data is pending, set SPLICE_F_MORE + * If this is the last data and SPLICE_F_MORE was not set + * initially, clears it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index dab9a5f6d..d6c787dc2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -523,9 +523,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", diff --git a/include/linux/pci.h b/include/linux/pci.h index 469c9536c..579baf06f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -176,6 +176,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, /* Provide indication device is assigned by a Virtual Machine Manager */ PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, + /* Get VPD from function 0 VPD */ + PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), }; enum pci_irq_reroute_variant { diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index e156ce155..6218a2dad 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -142,6 +142,41 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter, int seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num); +/** + * seq_show_options - display mount options with appropriate escapes. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, can be NULL + */ +static inline void seq_show_option(struct seq_file *m, const char *name, + const char *value) +{ + seq_putc(m, ','); + seq_escape(m, name, ",= \t\n\\"); + if (value) { + seq_putc(m, '='); + seq_escape(m, value, ", \t\n\\"); + } +} + +/** + * seq_show_option_n - display mount options with appropriate escapes + * where @value must be a specific length. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, cannot be NULL + * @length: the length of @value to display + * + * This is a macro since this uses "length" to define the size of the + * stack buffer. 
+ */ +#define seq_show_option_n(m, name, value, length) { \ + char val_buf[length + 1]; \ + strncpy(val_buf, value, length); \ + val_buf[length] = '\0'; \ + seq_show_option(m, name, val_buf); \ +} + #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files diff --git a/include/sound/wm8904.h b/include/sound/wm8904.h index 898be3a8d..6d8f8fba3 100644 --- a/include/sound/wm8904.h +++ b/include/sound/wm8904.h @@ -119,7 +119,7 @@ #define WM8904_MIC_REGS 2 #define WM8904_GPIO_REGS 4 #define WM8904_DRC_REGS 4 -#define WM8904_EQ_REGS 25 +#define WM8904_EQ_REGS 24 /** * DRC configurations are specified with a label and a set of register diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad70cada2..f48e4e739 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1053,15 +1053,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + seq_show_option(seq, ss->name, NULL); if (test_bit(ROOT_NOPREFIX, &root->flags)) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); if (clone_children(&root->top_cgroup)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); mutex_unlock(&cgroup_root_mutex); return 0; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index fb655f5f9..15374d0ac 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internals.h" @@ -326,18 +327,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. + */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ @@ -358,6 +370,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) diff --git a/kernel/module.c b/kernel/module.c index 6282fdc4d..fac398bb5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -969,11 +969,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. 
+ */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8d443c36c..12b6466b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1983,11 +1983,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; finish_arch_switch(prev); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c65ccfcae..cc248e5ed 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -712,8 +712,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); + smp_mb(); prev->on_cpu = 0; #endif #ifdef CONFIG_DEBUG_SPINLOCK diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c95833821..b3f540792 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -291,7 +291,7 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { + if ((abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { clocksource_unstable(cs, cs_nsec - wd_nsec); continue; } diff --git a/mm/filemap.c b/mm/filemap.c index 02c00eeef..48a807ac0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2442,6 +2442,11 @@ static ssize_t generic_perform_write(struct file *file, break; } + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status)) @@ -2482,10 +2487,6 @@ static ssize_t generic_perform_write(struct file *file, written += copied; balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } } while (iov_iter_count(i)); return written ? written : status; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc36e280c..e622aab7f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2503,6 +2503,14 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, if (iter_vma == vma) continue; + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + /* * Unmap the page from other VMAs without their own reserves. 
* They get marked to be SIGKILLed if they fault in these diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8755a3079..7fc10b915 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -137,20 +137,24 @@ static int __xfrm6_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct xfrm_state *x = dst->xfrm; int mtu = ip6_skb_dst_mtu(skb); + bool toobig; - if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { + if (x->props.mode != XFRM_MODE_TUNNEL) + goto skip_frag; + + toobig = skb->len > mtu && !skb_is_gso(skb); + + if (toobig && xfrm6_local_dontfrag(skb)) { xfrm6_local_rxpmtu(skb, mtu); return -EMSGSIZE; - } else if (!skb->local_df && skb->len > mtu && skb->sk) { + } else if (!skb->local_df && toobig && skb->sk) { xfrm6_local_error(skb, mtu); return -EMSGSIZE; } - if (x->props.mode == XFRM_MODE_TUNNEL && - ((skb->len > mtu && !skb_is_gso(skb)) || - dst_allfrag(skb_dst(skb)))) { + if (toobig || dst_allfrag(skb_dst(skb))) return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); - } +skip_frag: return x->outer_mode->afinfo->output_finish(skb); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index bed52a6ba..d7bfc431d 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -284,9 +284,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 42eb7ba0b..897a5f14c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -545,6 +545,7 @@ static int send_reply(struct svcxprt_rdma *rdma, { struct ib_send_wr send_wr; struct ib_send_wr inv_wr; + u32 xdr_off; int sge_no; int sge_bytes; int page_no; @@ -584,8 +585,8 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->direction = DMA_TO_DEVICE; /* Map the payload indicated by 'byte_count' */ + xdr_off = 0; for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - int xdr_off = 0; sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); byte_count -= sge_bytes; if (!vec->frmr) { @@ -623,6 +624,14 @@ static int send_reply(struct svcxprt_rdma *rdma, if (page_no+1 >= sge_no) ctxt->sge[page_no+1].length = 0; } + + /* The loop above bumps sc_dma_used for each sge. The + * xdr_buf.tail gets a separate sge, but resides in the + * same page as xdr_buf.head. Don't count it twice. 
+ */ + if (sge_no > ctxt->count) + atomic_dec(&rdma->sc_dma_used); + BUG_ON(sge_no > rdma->sc_max_sge); memset(&send_wr, 0, sizeof send_wr); ctxt->wr_op = IB_WR_SEND; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31275e52c..d4a564fec 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -811,6 +811,7 @@ static void xs_reset_transport(struct sock_xprt *transport) { struct socket *sock = transport->sock; struct sock *sk = transport->inet; + struct rpc_xprt *xprt = &transport->xprt; if (sk == NULL) return; @@ -824,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport) sk->sk_user_data = NULL; xs_restore_old_callbacks(transport, sk); + xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); sk->sk_no_check = 0; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 70d93c605..5b3796788 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1026,7 +1026,7 @@ static void selinux_write_opts(struct seq_file *m, seq_puts(m, prefix); if (has_comma) seq_putc(m, '\"'); - seq_puts(m, opts->mnt_opts[i]); + seq_escape(m, opts->mnt_opts[i], "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } diff --git a/sound/arm/Kconfig b/sound/arm/Kconfig index 885683a3b..e04062117 100644 --- a/sound/arm/Kconfig +++ b/sound/arm/Kconfig @@ -9,6 +9,14 @@ menuconfig SND_ARM Drivers that are implemented on ASoC can be found in "ALSA for SoC audio support" section. +config SND_PXA2XX_LIB + tristate + select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 + select SND_DMAENGINE_PCM + +config SND_PXA2XX_LIB_AC97 + bool + if SND_ARM config SND_ARMAACI @@ -21,13 +29,6 @@ config SND_PXA2XX_PCM tristate select SND_PCM -config SND_PXA2XX_LIB - tristate - select SND_AC97_CODEC if SND_PXA2XX_LIB_AC97 - -config SND_PXA2XX_LIB_AC97 - bool - config SND_PXA2XX_AC97 tristate "AC97 driver for the Intel PXA2xx chip" depends on ARCH_PXA diff --git a/sound/soc/pxa/Kconfig b/sound/soc/pxa/Kconfig index a0f7d3cfa..23deb67b8 100644 --- a/sound/soc/pxa/Kconfig +++ b/sound/soc/pxa/Kconfig @@ -1,7 +1,6 @@ config SND_PXA2XX_SOC tristate "SoC Audio for the Intel PXA2xx chip" depends on ARCH_PXA - select SND_ARM select SND_PXA2XX_LIB help Say Y or M if you want to add support for codecs attached to @@ -15,7 +14,6 @@ config SND_PXA2XX_AC97 config SND_PXA2XX_SOC_AC97 tristate select AC97_BUS - select SND_ARM select SND_PXA2XX_LIB_AC97 select SND_SOC_AC97_BUS diff --git a/sound/synth/emux/emux_oss.c b/sound/synth/emux/emux_oss.c index daf61abc3..646b66703 100644 --- a/sound/synth/emux/emux_oss.c +++ b/sound/synth/emux/emux_oss.c @@ -69,7 +69,8 @@ snd_emux_init_seq_oss(struct snd_emux *emu) struct snd_seq_oss_reg *arg; struct snd_seq_device *dev; - if (snd_seq_device_new(emu->card, 0, SNDRV_SEQ_DEV_ID_OSS, + /* using device#1 here for avoiding conflicts with OPL3 */ + if (snd_seq_device_new(emu->card, 1, SNDRV_SEQ_DEV_ID_OSS, sizeof(struct snd_seq_oss_reg), &dev) < 0) return; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index c0b70c697..5a4482c2a 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1060,25 +1060,19 @@ static void print_cpudesc(struct perf_header *ph, int fd, FILE *fp) static void print_nrcpus(struct perf_header *ph, int fd, FILE *fp) { ssize_t ret; - u32 nr; + u32 nr[2]; ret = read(fd, &nr, sizeof(nr)); if (ret != (ssize_t)sizeof(nr)) - nr = -1; /* interpreted as error */ + nr[0] = nr[1] = -1; /* interpreted as error */ - if (ph->needs_swap) - nr = bswap_32(nr); - - fprintf(fp, "# nrcpus online : %u\n", nr); - - ret = 
read(fd, &nr, sizeof(nr)); - if (ret != (ssize_t)sizeof(nr)) - nr = -1; /* interpreted as error */ - - if (ph->needs_swap) - nr = bswap_32(nr); + if (ph->needs_swap) { + nr[0] = bswap_32(nr[0]); + nr[1] = bswap_32(nr[1]); + } - fprintf(fp, "# nrcpus avail : %u\n", nr); + fprintf(fp, "# nrcpus online : %u\n", nr[1]); + fprintf(fp, "# nrcpus avail : %u\n", nr[0]); } static void print_version(struct perf_header *ph, int fd, FILE *fp) From 33d52a16b2b16dbe48d635998c2839f2a412cb8c Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 9 May 2016 18:27:58 +0800 Subject: [PATCH 11/52] Fix Google services power drain --- ik.ramdisk/common/init.idlekernel.sh | 33 ++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/ik.ramdisk/common/init.idlekernel.sh b/ik.ramdisk/common/init.idlekernel.sh index ccbab5cf8..4c995f67e 100755 --- a/ik.ramdisk/common/init.idlekernel.sh +++ b/ik.ramdisk/common/init.idlekernel.sh @@ -135,3 +135,36 @@ echo `cat $CFILE` > $SFILE stop thermal-engine sleep 2 start thermal-engine + +( + sleep 60; + + BB=/system/xbin/busybox + + # stop google service and restart it on boot. this remove high cpu load and ram leak! + if [ "$($BB pidof com.google.android.gms | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms)"; + fi; + if [ "$($BB pidof com.google.android.gms.unstable | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.unstable)"; + fi; + if [ "$($BB pidof com.google.android.gms.persistent | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.persistent)"; + fi; + if [ "$($BB pidof com.google.android.gms.wearable | wc -l)" -eq "1" ]; then + $BB kill "$($BB pidof com.google.android.gms.wearable)"; + fi; + + # Google Services battery drain fixer by Alcolawl@xda + # http://forum.xda-developers.com/google-nexus-5/general/script-google-play-services-battery-t3059585/post59563859 + pm enable com.google.android.gms/.update.SystemUpdateActivity + pm enable com.google.android.gms/.update.SystemUpdateService + pm enable com.google.android.gms/.update.SystemUpdateService$ActiveReceiver + pm enable com.google.android.gms/.update.SystemUpdateService$Receiver + pm enable com.google.android.gms/.update.SystemUpdateService$SecretCodeReceiver + pm enable com.google.android.gsf/.update.SystemUpdateActivity + pm enable com.google.android.gsf/.update.SystemUpdatePanoActivity + pm enable com.google.android.gsf/.update.SystemUpdateService + pm enable com.google.android.gsf/.update.SystemUpdateService$Receiver + pm enable com.google.android.gsf/.update.SystemUpdateService$SecretCodeReceiver +)& \ No newline at end of file From 155fb9674cc925e91ab858f3a29023383e0ea08d Mon Sep 17 00:00:00 2001 From: Carck Date: Mon, 16 May 2016 12:09:46 +0800 Subject: [PATCH 12/52] UKSM: Correct default value of uksm/run --- ik.ramdisk/common/init.idlekernel.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ik.ramdisk/common/init.idlekernel.sh b/ik.ramdisk/common/init.idlekernel.sh index 4c995f67e..691d2dd02 100755 --- a/ik.ramdisk/common/init.idlekernel.sh +++ b/ik.ramdisk/common/init.idlekernel.sh @@ -123,7 +123,7 @@ echo `cat $CFILE` > $SFILE # CFILE=$DDIR/uksm_run SFILE=/sys/kernel/mm/uksm/run -CDEF=N +CDEF=0 if [ ! 
-f $CFILE ]; then echo $CDEF > $CFILE fi @@ -167,4 +167,4 @@ start thermal-engine pm enable com.google.android.gsf/.update.SystemUpdateService pm enable com.google.android.gsf/.update.SystemUpdateService$Receiver pm enable com.google.android.gsf/.update.SystemUpdateService$SecretCodeReceiver -)& \ No newline at end of file +)& From 25ee890b0234a5feeef85dc9ad81ad9cdc998984 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 04:45:53 -0400 Subject: [PATCH 13/52] LZ4: add lz4 api --- arch/arm/configs/ik_defconfig | 5 + lib/Kconfig | 13 + lib/Makefile | 4 + lib/lz4/Makefile | 3 + lib/lz4/lz4_compress.c | 443 ++++++++++++++++++++++++++++ lib/lz4/lz4_decompress.c | 343 ++++++++++++++++++++++ lib/lz4/lz4defs.h | 157 ++++++++++ lib/lz4/lz4hc_compress.c | 539 ++++++++++++++++++++++++++++++++++ 8 files changed, 1507 insertions(+) create mode 100644 lib/lz4/Makefile create mode 100644 lib/lz4/lz4_compress.c create mode 100644 lib/lz4/lz4_decompress.c create mode 100644 lib/lz4/lz4defs.h create mode 100644 lib/lz4/lz4hc_compress.c diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index e98bc3f5e..5b947d920 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -47,6 +47,7 @@ CONFIG_LOCALVERSION="-idleKernel-hlte-" CONFIG_HAVE_KERNEL_LZMA=y CONFIG_HAVE_KERNEL_XZ=y CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y # CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set @@ -4232,6 +4233,9 @@ CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=y CONFIG_LZO_COMPRESS=y CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=y +CONFIG_LZ4_DECOMPRESS=y CONFIG_XZ_DEC=y CONFIG_XZ_DEC_X86=y CONFIG_XZ_DEC_POWERPC=y @@ -4243,6 +4247,7 @@ CONFIG_XZ_DEC_BCJ=y # CONFIG_XZ_DEC_TEST is not set CONFIG_DECOMPRESS_GZIP=y CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZ4=y CONFIG_GENERIC_ALLOCATOR=y CONFIG_REED_SOLOMON=y CONFIG_REED_SOLOMON_ENC8=y diff --git a/lib/Kconfig b/lib/Kconfig index 09f0d8a64..4b23c9600 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -177,6 +177,15 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # @@ -201,6 +210,10 @@ config DECOMPRESS_LZO select LZO_DECOMPRESS tristate +config DECOMPRESS_LZ4 + select LZ4_DECOMPRESS + tristate + # # Generic allocator support is selected if needed # diff --git a/lib/Makefile b/lib/Makefile index 9161b969d..1919769ec 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -72,6 +72,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ @@ -80,6 +83,7 @@ lib-$(CONFIG_DECOMPRESS_BZIP2) += decompress_bunzip2.o lib-$(CONFIG_DECOMPRESS_LZMA) += decompress_unlzma.o lib-$(CONFIG_DECOMPRESS_XZ) += decompress_unxz.o lib-$(CONFIG_DECOMPRESS_LZO) += decompress_unlzo.o +lib-$(CONFIG_DECOMPRESS_LZ4) += decompress_unlz4.o obj-$(CONFIG_TEXTSEARCH) += textsearch.o obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000..8085d04e9 --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git 
a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new file mode 100644 index 000000000..28321d8f7 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. 
+ * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop 
*/ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + 
hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL(lz4_compress); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000..6d940c72b --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,343 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include +#include +#endif +#include + +#include + +#include "lz4defs.h" + +static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 +static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + if (unlikely(length > (size_t)(length + len))) + goto _output_error; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + if (unlikely(length > (size_t)(length + *ip))) + goto _output_error; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + int dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return -1; +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if 
((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + int dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return -1; +} + +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000..9b4182fd3 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,157 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define ARM_EFFICIENT_UNALIGNED_ACCESS +#define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) + +#define A16(x) (((U16_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define A64(x) get_unaligned((u64 *)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) 
\ + do { \ + u8 *e = (d) + l; \ + LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000..f344f76b6 --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } +#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) 
== A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + return longest; +} + +static inline int 
lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +static int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int correction = + (int)(ip + ml - start2); + 
start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < (int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL(lz4hc_compress); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); From d59ed16239388a1163af7e46661c1cf74b144ed3 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:09:25 -0400 Subject: [PATCH 14/52] crypto: add lz4 Cryptographic API Add support for lz4 and lz4hc compression algorithm using the lib/lz4/* codebase. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Chanho Min Cc: "Darrick J. Wong" Cc: Bob Pearson Cc: Richard Weinberger Cc: Herbert Xu Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Pranav Vashi --- crypto/Kconfig | 16 ++++++++ crypto/Makefile | 2 + crypto/lz4.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ crypto/lz4hc.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 230 insertions(+) create mode 100644 crypto/lz4.c create mode 100644 crypto/lz4hc.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 566ebe0d8..9a6bd9712 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1035,6 +1035,22 @@ config CRYPTO_LZO help This is the LZO algorithm. +config CRYPTO_LZ4 + tristate "LZ4 compression algorithm" + select CRYPTO_ALGAPI + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 algorithm. + +config CRYPTO_LZ4HC + tristate "LZ4HC compression algorithm" + select CRYPTO_ALGAPI + select LZ4HC_COMPRESS + select LZ4_DECOMPRESS + help + This is the LZ4 high compression mode algorithm. 
+ comment "Random Number Generation" config CRYPTO_ANSI_CPRNG diff --git a/crypto/Makefile b/crypto/Makefile index cba53f235..df6e37262 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -86,6 +86,8 @@ obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o +obj-$(CONFIG_CRYPTO_LZ4) += lz4.o +obj-$(CONFIG_CRYPTO_LZ4HC) += lz4hc.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o diff --git a/crypto/lz4.c b/crypto/lz4.c new file mode 100644 index 000000000..4586dd15b --- /dev/null +++ b/crypto/lz4.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include + +struct lz4_ctx { + void *lz4_comp_mem; +}; + +static int lz4_init(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4_comp_mem = vmalloc(LZ4_MEM_COMPRESS); + if (!ctx->lz4_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4_exit(struct crypto_tfm *tfm) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + vfree(ctx->lz4_comp_mem); +} + +static int lz4_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4_compress(src, slen, dst, &tmp_len, ctx->lz4_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4 = { + .cra_name = "lz4", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4.cra_list), + .cra_init = lz4_init, + .cra_exit = lz4_exit, + .cra_u = { .compress = { + .coa_compress = lz4_compress_crypto, + .coa_decompress = lz4_decompress_crypto } } +}; + +static int __init lz4_mod_init(void) +{ + return crypto_register_alg(&alg_lz4); +} + +static void __exit lz4_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4); +} + +module_init(lz4_mod_init); +module_exit(lz4_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Compression Algorithm"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c new file mode 100644 index 000000000..151ba31d3 --- /dev/null +++ b/crypto/lz4hc.c @@ -0,0 +1,106 @@ +/* + * Cryptographic API. 
+ * + * Copyright (c) 2013 Chanho Min + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include + +struct lz4hc_ctx { + void *lz4hc_comp_mem; +}; + +static int lz4hc_init(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lz4hc_comp_mem = vmalloc(LZ4HC_MEM_COMPRESS); + if (!ctx->lz4hc_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lz4hc_exit(struct crypto_tfm *tfm) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lz4hc_comp_mem); +} + +static int lz4hc_compress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lz4hc_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; + int err; + + err = lz4hc_compress(src, slen, dst, &tmp_len, ctx->lz4hc_comp_mem); + + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; + size_t __slen = slen; + + err = lz4_decompress(src, &__slen, dst, tmp_len); + if (err < 0) + return -EINVAL; + + *dlen = tmp_len; + return err; +} + +static struct crypto_alg alg_lz4hc = { + .cra_name = "lz4hc", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lz4hc_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg_lz4hc.cra_list), + .cra_init = lz4hc_init, + .cra_exit = lz4hc_exit, + .cra_u = { .compress = { + .coa_compress = lz4hc_compress_crypto, + .coa_decompress = lz4hc_decompress_crypto } } +}; + +static int __init lz4hc_mod_init(void) +{ + return crypto_register_alg(&alg_lz4hc); +} + +static void __exit lz4hc_mod_fini(void) +{ + crypto_unregister_alg(&alg_lz4hc); +} + +module_init(lz4hc_mod_init); +module_exit(lz4hc_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); From 864eb74afc27bfaaff3067ed180c644049ea89b2 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:10:23 -0400 Subject: [PATCH 15/52] crypto: lz4,lz4hc - fix decompression The lz4 library has two functions for decompression, with slightly different signatures and behaviour. The lz4_decompress_crypto() function seemed to be using the one that assumes that the decompressed length is known in advance. This patch switches to the other decompression function and makes sure that the length of the decompressed output is properly returned to the caller. The same issue was present in the lz4hc algorithm. Coincidentally, this change also makes very basic lz4 and lz4hc compression tests in testmgr pass. 
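To make the difference concrete, a minimal sketch of the corrected call pattern (variable names as in the hunks below): the crypto wrapper only knows the destination capacity, so it must use the unknown-output-size variant and report the produced length back through *dlen.

	/*
	 * lz4_decompress(src, &src_len, dst, actual_dest_len) assumes the
	 * decompressed size is known up front and reports the consumed
	 * input size back through src_len.
	 * lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len)
	 * only needs the destination capacity and reports the produced
	 * output size back through dst_len.
	 */
	size_t tmp_len = *dlen;		/* capacity of dst */

	err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len);
	if (err < 0)
		return -EINVAL;

	*dlen = tmp_len;		/* actual decompressed size */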
Signed-off-by: KOVACS Krisztian Signed-off-by: Herbert Xu Signed-off-by: Pranav Vashi --- crypto/lz4.c | 2 +- crypto/lz4hc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crypto/lz4.c b/crypto/lz4.c index 4586dd15b..34d072b72 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -68,7 +68,7 @@ static int lz4_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, size_t tmp_len = *dlen; size_t __slen = slen; - err = lz4_decompress(src, &__slen, dst, tmp_len); + err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len); if (err < 0) return -EINVAL; diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index 151ba31d3..9218b3fed 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -68,7 +68,7 @@ static int lz4hc_decompress_crypto(struct crypto_tfm *tfm, const u8 *src, size_t tmp_len = *dlen; size_t __slen = slen; - err = lz4_decompress(src, &__slen, dst, tmp_len); + err = lz4_decompress_unknownoutputsize(src, __slen, dst, &tmp_len); if (err < 0) return -EINVAL; From 9f3c4b7179983da9473fd3401dd2fb90273fe398 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:12:10 -0400 Subject: [PATCH 16/52] crypto: prefix module autoloading with "crypto-" This prefixes all crypto module loading with "crypto-" so we never run the risk of exposing module auto-loading to userspace via a crypto API, as demonstrated by Mathias Krause: https://lkml.org/lkml/2013/3/4/70 Signed-off-by: Kees Cook Signed-off-by: Herbert Xu [neo: Only lz4, we have for others.] Signed-off-by: Pranav Vashi --- crypto/lz4.c | 1 + crypto/lz4hc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/crypto/lz4.c b/crypto/lz4.c index 34d072b72..aefbceaf3 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -104,3 +104,4 @@ module_exit(lz4_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZ4 Compression Algorithm"); +MODULE_ALIAS_CRYPTO("lz4"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index 9218b3fed..a1d3b5bd3 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -104,3 +104,4 @@ module_exit(lz4hc_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); +MODULE_ALIAS_CRYPTO("lz4hc"); From 1e13f81a9448d7cabd79a38aca7ee44f12b3ee20 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:15:01 -0400 Subject: [PATCH 17/52] CONFIG: Enable CRYPTO LZ4 --- arch/arm/configs/ik_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index 5b947d920..c0743493d 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -4194,6 +4194,9 @@ CONFIG_CRYPTO_TWOFISH_COMMON=y CONFIG_CRYPTO_DEFLATE=y # CONFIG_CRYPTO_ZLIB is not set CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=y # # Random Number Generation From a69d4e4f1b47eb624c31febb13ee7bff65fb5ac7 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:18:51 -0400 Subject: [PATCH 18/52] ZSWAP: Use lz4 for zswap if possible --- mm/zswap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index b825f9876..a00c56482 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -80,7 +80,11 @@ module_param_named(enabled, zswap_enabled, bool, 0); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" +#ifndef CONFIG_CRYPTO_LZ4 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +#else +static char *zswap_compressor = "lz4"; +#endif module_param_named(compressor, zswap_compressor, charp, 0); /* The maximum percentage of memory that the compressed pool can 
occupy */ From 66561caa0e223163bb180cbbd0c3dab1b0920f2a Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 05:28:29 -0400 Subject: [PATCH 19/52] LZ4: add missed include file for lz4 --- include/linux/lz4.h | 87 +++++++++++++++++++++++++++++++++++++++++++ include/linux/unlz4.h | 10 +++++ 2 files changed, 97 insertions(+) create mode 100644 include/linux/lz4.h create mode 100644 include/linux/unlz4.h diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000..6b784c59f --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,87 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#define LZ4_MEM_COMPRESS (16384) +#define LZ4HC_MEM_COMPRESS (262144 + (2 * sizeof(unsigned char *))) + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + + /* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, whcih is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. 
+ * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); +#endif diff --git a/include/linux/unlz4.h b/include/linux/unlz4.h new file mode 100644 index 000000000..3273c2f36 --- /dev/null +++ b/include/linux/unlz4.h @@ -0,0 +1,10 @@ +#ifndef DECOMPRESS_UNLZ4_H +#define DECOMPRESS_UNLZ4_H + +int unlz4(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), + unsigned char *output, + long *pos, + void(*error)(char *x)); +#endif From 8672bf85c1850574f6a7420b831b347f6d9d4f28 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 22 May 2016 06:21:56 -0400 Subject: [PATCH 20/52] Revert crypto: prefix module autoloading with "crypto-" this reverted commit 481424adc454173f985b5d52569598417ff1f51d) --- crypto/lz4.c | 1 - crypto/lz4hc.c | 1 - 2 files changed, 2 deletions(-) diff --git a/crypto/lz4.c b/crypto/lz4.c index aefbceaf3..34d072b72 100644 --- a/crypto/lz4.c +++ b/crypto/lz4.c @@ -104,4 +104,3 @@ module_exit(lz4_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZ4 Compression Algorithm"); -MODULE_ALIAS_CRYPTO("lz4"); diff --git a/crypto/lz4hc.c b/crypto/lz4hc.c index a1d3b5bd3..9218b3fed 100644 --- a/crypto/lz4hc.c +++ b/crypto/lz4hc.c @@ -104,4 +104,3 @@ module_exit(lz4hc_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZ4HC Compression Algorithm"); -MODULE_ALIAS_CRYPTO("lz4hc"); From def3dc9c9e22cfa76d00227a8ce4526237c04572 Mon Sep 17 00:00:00 2001 From: Carck Date: Tue, 7 Jun 2016 16:56:46 +0800 Subject: [PATCH 21/52] Fix CONFIG_HZ dependency in wifi driver. Sleep for HZ jiffies, not 100. 
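In short (a sketch, not part of the patch): schedule_timeout() takes its argument in jiffies, so a literal 100 only means one second when CONFIG_HZ=100; on a 300 Hz kernel it is roughly a third of that.

	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(100);		/* 1 s at HZ=100, ~333 ms at HZ=300 */

	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(HZ);		/* ~1 s regardless of CONFIG_HZ */
	/* equivalent: schedule_timeout(msecs_to_jiffies(1000)); */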
bug 27419732 Change-Id: I9717f2b539a7fe2e2ba2cd6c3fd231f987ec38a4 http://review.cyanogenmod.org/#/c/147683/ --- drivers/net/wireless/bcmdhd/wl_cfg80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/bcmdhd/wl_cfg80211.c b/drivers/net/wireless/bcmdhd/wl_cfg80211.c index 671c85a42..24081ea33 100644 --- a/drivers/net/wireless/bcmdhd/wl_cfg80211.c +++ b/drivers/net/wireless/bcmdhd/wl_cfg80211.c @@ -10084,7 +10084,7 @@ wl_cfg80211_netdev_notifier_call(struct notifier_block * nb, break; } set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(100); + schedule_timeout(HZ); set_current_state(TASK_RUNNING); refcnt++; } From ab8b83b16907cbb6468580b9b929482978331829 Mon Sep 17 00:00:00 2001 From: Carck Date: Wed, 8 Jun 2016 13:04:43 +0800 Subject: [PATCH 22/52] RAMDISK: disable init.carrier.rc --- ik.ramdisk/common/init.qcom.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ik.ramdisk/common/init.qcom.rc b/ik.ramdisk/common/init.qcom.rc index 5980e5ad3..aee4cfc3a 100755 --- a/ik.ramdisk/common/init.qcom.rc +++ b/ik.ramdisk/common/init.qcom.rc @@ -27,7 +27,7 @@ import init.qcom.usb.rc import init.target.rc -import init.carrier.rc +#import init.carrier.rc on early-init mount debugfs debugfs /sys/kernel/debug From 7c4affa009167e07b085d7d0789237e55e24e09c Mon Sep 17 00:00:00 2001 From: Carck Date: Wed, 8 Jun 2016 13:09:02 +0800 Subject: [PATCH 23/52] Tag as release version 7.2.0 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index a3fcc7121..0ee843cc6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -7.1.0 +7.2.0 From e17e0affb7065f8350d9362cf1dbe811bffc7dc7 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Wed, 8 Jun 2016 02:11:07 -0400 Subject: [PATCH 24/52] SSP: shut up debug messages (reverted from commit f6c817aeb8cc6ad018f1ba0a46499c5f2b4e8200) --- drivers/sensorhub/stm32f/ssp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/sensorhub/stm32f/ssp.h b/drivers/sensorhub/stm32f/ssp.h index 414182817..377170479 100755 --- a/drivers/sensorhub/stm32f/ssp.h +++ b/drivers/sensorhub/stm32f/ssp.h @@ -48,7 +48,7 @@ #include "ssp_sensorhub.h" #endif -#define SSP_DBG 1 +#define SSP_DBG 0 #define SUCCESS 1 #define FAIL 0 From 17248b1d5ad1f80f514cec2c9e4621fe4ab741b7 Mon Sep 17 00:00:00 2001 From: carck Date: Sat, 11 Jun 2016 05:05:43 -0400 Subject: [PATCH 25/52] SSP: Fix compile error when debug is off --- drivers/sensorhub/stm32f/ssp.h | 9 +++++---- drivers/sensorhub/stm32f/ssp_debug.c | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/sensorhub/stm32f/ssp.h b/drivers/sensorhub/stm32f/ssp.h index 377170479..81c24adf0 100755 --- a/drivers/sensorhub/stm32f/ssp.h +++ b/drivers/sensorhub/stm32f/ssp.h @@ -56,18 +56,19 @@ #define FACTORY_DATA_MAX 100 -#if SSP_DBG -#define SSP_FUNC_DBG 1 -#define SSP_DATA_DBG 0 - /* ssp mcu device ID */ #define DEVICE_ID 0x55 +#if SSP_DBG +#define SSP_FUNC_DBG 1 +#define SSP_DATA_DBG 0 #define ssp_dbg(dev, format, ...) do { \ printk(KERN_INFO dev, format, ##__VA_ARGS__); \ } while (0) #else +#define SSP_FUNC_DBG 0 +#define SSP_DATA_DBG 0 #define ssp_dbg(dev, format, ...) 
#endif diff --git a/drivers/sensorhub/stm32f/ssp_debug.c b/drivers/sensorhub/stm32f/ssp_debug.c index af356f4ca..3d25335b4 100755 --- a/drivers/sensorhub/stm32f/ssp_debug.c +++ b/drivers/sensorhub/stm32f/ssp_debug.c @@ -196,7 +196,9 @@ int print_mcu_debug(char *pchRcvDataFrame, int *pDataIdx, int iRcvDataFrameLength) { int iLength = pchRcvDataFrame[(*pDataIdx)++]; +#if SSP_DBG int cur = *pDataIdx; +#endif if (iLength > iRcvDataFrameLength - *pDataIdx || iLength <= 0) { ssp_dbg("[SSP]: MSG From MCU - invalid debug length(%d/%d/%d)\n", From 081788bee62d0e25d17a1f12e42a3a72a71c63ff Mon Sep 17 00:00:00 2001 From: carck Date: Sat, 11 Jun 2016 05:07:02 -0400 Subject: [PATCH 26/52] RAMDISK: set swappiness to 50 --- ik.ramdisk/common/init.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ik.ramdisk/common/init.rc b/ik.ramdisk/common/init.rc index 7c6d46d58..756f30c68 100755 --- a/ik.ramdisk/common/init.rc +++ b/ik.ramdisk/common/init.rc @@ -72,7 +72,7 @@ on init chown root system /sys/fs/cgroup/memory/tasks chmod 0660 /sys/fs/cgroup/memory/tasks mkdir /sys/fs/cgroup/memory/sw 0750 root system - write /sys/fs/cgroup/memory/sw/memory.swappiness 100 + write /sys/fs/cgroup/memory/sw/memory.swappiness 50 write /sys/fs/cgroup/memory/sw/memory.move_charge_at_immigrate 1 chown root system /sys/fs/cgroup/memory/sw/tasks chmod 0660 /sys/fs/cgroup/memory/sw/tasks From 713b1dc9c17956677162b08fcf126ce0ee2cc413 Mon Sep 17 00:00:00 2001 From: Carck Date: Sun, 12 Jun 2016 20:48:39 +0800 Subject: [PATCH 27/52] clk: shut up debug logs --- arch/arm/mach-msm/clock-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-msm/clock-debug.c b/arch/arm/mach-msm/clock-debug.c index ec3bd0036..89d97b0d1 100644 --- a/arch/arm/mach-msm/clock-debug.c +++ b/arch/arm/mach-msm/clock-debug.c @@ -33,7 +33,7 @@ static LIST_HEAD(clk_list); static DEFINE_SPINLOCK(clk_list_lock); static struct dentry *debugfs_base; -static u32 debug_suspend = 1; +static u32 debug_suspend = 0; struct clk_table { struct list_head node; From 27302318cce3133b63f3ad8af4df2f6e49f95a6b Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Fri, 17 Jun 2016 01:29:24 -0400 Subject: [PATCH 28/52] ZRAM: add zram support --- arch/arm/configs/ik_defconfig | 4 +- drivers/block/zram/Kconfig | 26 + drivers/block/zram/Makefile | 5 + drivers/block/zram/zcomp.c | 230 +++++ drivers/block/zram/zcomp.h | 61 ++ drivers/block/zram/zcomp_lz4.c | 56 ++ drivers/block/zram/zcomp_lz4.h | 17 + drivers/block/zram/zcomp_lzo.c | 56 ++ drivers/block/zram/zcomp_lzo.h | 17 + drivers/block/zram/zram_drv.c | 1555 ++++++++++++++++++++++++++++++++ drivers/block/zram/zram_drv.h | 122 +++ 11 files changed, 2148 insertions(+), 1 deletion(-) create mode 100644 drivers/block/zram/Kconfig create mode 100644 drivers/block/zram/Makefile create mode 100644 drivers/block/zram/zcomp.c create mode 100644 drivers/block/zram/zcomp.h create mode 100644 drivers/block/zram/zcomp_lz4.c create mode 100644 drivers/block/zram/zcomp_lz4.h create mode 100644 drivers/block/zram/zcomp_lzo.c create mode 100644 drivers/block/zram/zcomp_lzo.h create mode 100644 drivers/block/zram/zram_drv.c create mode 100644 drivers/block/zram/zram_drv.h diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index c0743493d..0d1527adc 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -1249,6 +1249,8 @@ CONFIG_OF_BATTERYDATA=y CONFIG_OF_SUBCMDLINE_PARSE=y # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y +CONFIG_ZRAM=y 
+CONFIG_ZRAM_LZ4_COMPRESS=y # CONFIG_BLK_DEV_COW_COMMON is not set CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 @@ -3682,7 +3684,7 @@ CONFIG_SEC_FPGA=y # CONFIG_SENSORS_SSP_STM is not set CONFIG_SENSORS_SSP_STM_32F=y # CONFIG_SENSORS_SSP_STM_HESTIA is not set -CONFIG_SENSORS_SSP_STM_LEGACY=y +# CONFIG_SENSORS_SSP_STM_LEGACY is not set CONFIG_SENSORS_SYSFS=y CONFIG_SENSORS_SSP=y # CONFIG_SENSORS_SSP_AK8963C is not set diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig new file mode 100644 index 000000000..386ba3d1a --- /dev/null +++ b/drivers/block/zram/Kconfig @@ -0,0 +1,26 @@ +config ZRAM + tristate "Compressed RAM block device support" + depends on BLOCK && SYSFS && ZSMALLOC + select LZO_COMPRESS + select LZO_DECOMPRESS + default n + help + Creates virtual block devices called /dev/zramX (X = 0, 1, ...). + Pages written to these disks are compressed and stored in memory + itself. These disks allow very fast I/O and compression provides + good amounts of memory savings. + + It has several use cases, for example: /tmp storage, use as swap + disks and maybe many more. + + See zram.txt for more information. + +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile new file mode 100644 index 000000000..be0763ff5 --- /dev/null +++ b/drivers/block/zram/Makefile @@ -0,0 +1,5 @@ +zram-y := zcomp_lzo.o zcomp.o zram_drv.o + +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + +obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000..b51a816d7 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif + +static struct zcomp_backend *backends[] = { + &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif + NULL +}; + +static struct zcomp_backend *find_backend(const char *compress) +{ + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(flags); + /* + * allocate 2 pages. 
1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (!strcmp(comp, backends[i]->name)) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); + else + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); + i++; + } + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; +} + +bool zcomp_available_algorithm(const char *comp) +{ + return find_backend(comp) != NULL; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + return *get_cpu_ptr(comp->stream); +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + put_cpu_ptr(comp->stream); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +static int __zcomp_cpu_notifier(struct zcomp *comp, + unsigned long action, unsigned long cpu) +{ + struct zcomp_strm *zstrm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) + break; + zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + if (IS_ERR_OR_NULL(zstrm)) { + pr_err("Can't allocate a compression stream\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(comp->stream, cpu) = zstrm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + zstrm = *per_cpu_ptr(comp->stream, cpu); + if (!IS_ERR_OR_NULL(zstrm)) + zcomp_strm_free(comp, zstrm); + *per_cpu_ptr(comp->stream, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcomp_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zcomp *comp = container_of(nb, typeof(*comp), notifier); + + return __zcomp_cpu_notifier(comp, action, cpu); +} + +static int zcomp_init(struct zcomp *comp) +{ + unsigned long cpu; + int ret; + + comp->notifier.notifier_call = zcomp_cpu_notifier; + + comp->stream = alloc_percpu(struct zcomp_strm *); + if (!comp->stream) + return -ENOMEM; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + ret = __zcomp_cpu_notifier(comp, CPU_UP_PREPARE, cpu); + if (ret == NOTIFY_BAD) + goto cleanup; + } + __register_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + +void zcomp_destroy(struct zcomp *comp) +{ + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zcomp_cpu_notifier(comp, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&comp->notifier); + cpu_notifier_register_done(); + + free_percpu(comp->stream); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. 
ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error, or any other error potentially + * returned by zcomp_init(). + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + int error; + + backend = find_backend(compress); + if (!backend) + return ERR_PTR(-EINVAL); + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + + comp->backend = backend; + error = zcomp_init(comp); + if (error) { + kfree(comp); + return ERR_PTR(error); + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000..ffd88cb74 --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(gfp_t flags); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct zcomp_strm * __percpu *stream; + struct zcomp_backend *backend; + struct notifier_block notifier; +}; + +ssize_t zcomp_available_show(const char *comp, char *buf); +bool zcomp_available_algorithm(const char *comp); + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000..0110086ac --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZ4_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZ4_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void zcomp_lz4_destroy(void *private) +{ + kvfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000..60613fb29 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000..ed7a1f054 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(gfp_t flags) +{ + void *ret; + + ret = kmalloc(LZO1X_MEM_COMPRESS, flags); + if (!ret) + ret = __vmalloc(LZO1X_MEM_COMPRESS, + flags | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; +} + +static void lzo_destroy(void *private) +{ + kvfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000..128c5807f --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c new file mode 100644 index 000000000..a0b07110d --- /dev/null +++ b/drivers/block/zram/zram_drv.c @@ -0,0 +1,1555 @@ +/* + * Compressed RAM block device + * + * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the licence that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + * + */ + +#define KMSG_COMPONENT "zram" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zram_drv.h" + +#ifdef CONFIG_STATE_NOTIFIER +#include +static struct notifier_block notif; +#endif + +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + +static int zram_major; +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +static const char *default_compressor = "lz4"; +#else +static const char *default_compressor = "lzo"; +#endif + +/* Module params (documentation at end) */ +static unsigned int num_devices = 1; + +static inline void deprecated_attr_warn(const char *name) +{ + pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n", + task_pid_nr(current), + current->comm, + name, + "See zram documentation."); +} + +#define ZRAM_ATTR_RO(name) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + \ + deprecated_attr_warn(__stringify(name)); \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static DEVICE_ATTR_RO(name); + +static inline bool init_done(struct zram *zram) +{ + return zram->disksize; +} + +static inline struct zram *dev_to_zram(struct device *dev) +{ + return (struct zram *)dev_to_disk(dev)->private_data; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + return meta->table[index].value & BIT(flag); +} + +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} + +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. 
+ */ +static inline bool valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return false; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return false; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return false; + + /* I/O request is valid */ + return true; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static bool page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return false; + } + + return true; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); +} + +#ifdef CONFIG_STATE_NOTIFIER +static int zram_compact(struct zram *zram) +{ + struct zram_meta *meta; + u64 new_size, data_size; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + pr_info("*** zram state notifier: zram is off ***\n"); + return -EINVAL; + } + + meta = zram->meta; + + data_size = atomic64_read(&zram->stats.compr_data_size); + + pr_info("*** zram state notifier: starting compaction ***\n"); + zs_compact(meta->mem_pool); + + new_size = atomic64_read(&zram->stats.compr_data_size); + if (new_size < data_size) + pr_info("%s compacted. 
Saved %llu kb.\n", + zram->disk->disk_name, + (unsigned long long)(data_size - new_size)); + pr_info("*** zram state notifier: finished compaction ***\n"); + + up_read(&zram->init_lock); + + return 0; +} + +static int zram_compact_cb(int id, void *ptr, void *data) +{ + zram_compact(ptr); + return 0; +} + +static int state_notifier_callback(struct notifier_block *this, + unsigned long event, void *data) +{ + switch (event) { + case STATE_NOTIFIER_ACTIVE: + break; + case STATE_NOTIFIER_SUSPEND: + idr_for_each(&zram_index_idr, &zram_compact_cb, NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} +#endif + +static ssize_t initstate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%u\n", val); +} + +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + +static ssize_t orig_data_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("orig_data_size"); + return scnprintf(buf, PAGE_SIZE, "%llu\n", + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); +} + +static ssize_t mem_used_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_total"); + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + val = zs_get_total_pages(meta->mem_pool); + } + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_limit"); + down_read(&zram->init_lock); + val = zram->limit_pages; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 limit; + char *tmp; + struct zram *zram = dev_to_zram(dev); + + limit = memparse(buf, &tmp); + if (buf == tmp) /* no chars parsed, invalid input */ + return -EINVAL; + + down_write(&zram->init_lock); + zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t mem_used_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_max"); + down_read(&zram->init_lock); + if (init_done(zram)) + val = atomic_long_read(&zram->stats.max_used_pages); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_used_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int err; + unsigned long val; + struct zram *zram = dev_to_zram(dev); + + err = kstrtoul(buf, 10, &val); + if (err || val != 0) + return -EINVAL; + + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + atomic_long_set(&zram->stats.max_used_pages, + zs_get_total_pages(meta->mem_pool)); + } + up_read(&zram->init_lock); + + return len; 
+} + +/* + * We switched to per-cpu streams and this attr is not needed anymore. + * However, we will keep it around for some time, because: + * a) we may revert per-cpu streams in the future + * b) it's visible to user space and we need to follow our 2 years + * retirement rule; but we already have a number of 'soon to be + * altered' attrs, so max_comp_streams need to wait for the next + * layoff cycle. + */ +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus()); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + return len; +} + +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + size_t sz; + + if (!zcomp_available_algorithm(buf)) + return -EINVAL; + + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + + /* ignore trailing newline */ + sz = strlen(zram->compressor); + if (sz > 0 && zram->compressor[sz - 1] == '\n') + zram->compressor[sz - 1] = 0x00; + + up_write(&zram->init_lock); + return len; +} + +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + meta = zram->meta; + zs_compact(meta->mem_pool); + up_read(&zram->init_lock); + + return len; +} + +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + struct zs_pool_stats pool_stats; + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; + + memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); + + down_read(&zram->init_lock); + if (init_done(zram)) { + mem_used = zs_get_total_pages(zram->meta->mem_pool); + zs_pool_stats(zram->meta->mem_pool, &pool_stats); + } + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + pool_stats.pages_compacted); + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t 
debug_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int version = 1; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "version: %d\n%8llu\n", + version, + (u64)atomic64_read(&zram->stats.writestall)); + up_read(&zram->init_lock); + + return ret; +} + +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +static DEVICE_ATTR_RO(debug_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) +{ + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; +} + +static inline void zram_meta_put(struct zram *zram) +{ + atomic_dec(&zram->refcount); +} + +static void zram_meta_free(struct zram_meta *meta, u64 disksize) +{ + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + + zs_destroy_pool(meta->mem_pool); + vfree(meta->table); + kfree(meta); +} + +static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +{ + size_t num_pages; + struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); + + if (!meta) + return NULL; + + num_pages = disksize >> PAGE_SHIFT; + meta->table = vzalloc(num_pages * sizeof(*meta->table)); + if (!meta->table) { + pr_err("Error allocating zram address table\n"); + goto out_error; + } + + meta->mem_pool = zs_create_pool(pool_name); + if (!meta->mem_pool) { + pr_err("Error creating memory pool\n"); + goto out_error; + } + + return meta; + +out_error: + vfree(meta->table); + kfree(meta); + return NULL; +} + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ +static void zram_free_page(struct zram *zram, size_t index) +{ + struct zram_meta *meta = zram->meta; + unsigned long handle = meta->table[index].handle; + + if (unlikely(!handle)) { + /* + * No memory is allocated for zero filled pages. + * Simply clear zero page flag. 
+ */ + if (zram_test_flag(meta, index, ZRAM_ZERO)) { + zram_clear_flag(meta, index, ZRAM_ZERO); + atomic64_dec(&zram->stats.zero_pages); + } + return; + } + + zs_free(meta->mem_pool, handle); + + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); + + meta->table[index].handle = 0; + zram_set_obj_size(meta, index, 0); +} + +static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +{ + int ret = 0; + unsigned char *cmem; + struct zram_meta *meta = zram->meta; + unsigned long handle; + size_t size; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + handle = meta->table[index].handle; + size = zram_get_obj_size(meta, index); + + if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + clear_page(mem); + return 0; + } + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + if (size == PAGE_SIZE) + copy_page(mem, cmem); + else + ret = zcomp_decompress(zram->comp, cmem, size, mem); + zs_unmap_object(meta->mem_pool, handle); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) { + pr_err("Decompression failed! err=%d, page=%u\n", ret, index); + return ret; + } + + return 0; +} + +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page; + unsigned char *user_mem, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + page = bvec->bv_page; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + handle_zero_page(bvec); + return 0; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + if (is_partial_io(bvec)) + /* Use a temporary buffer to decompress the page */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + + user_mem = kmap_atomic(page); + if (!is_partial_io(bvec)) + uncmem = user_mem; + + if (!uncmem) { + pr_err("Unable to allocate temp memory\n"); + ret = -ENOMEM; + goto out_cleanup; + } + + ret = zram_decompress_page(zram, uncmem, index); + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) + goto out_cleanup; + + if (is_partial_io(bvec)) + memcpy(user_mem + bvec->bv_offset, uncmem + offset, + bvec->bv_len); + + flush_dcache_page(page); + ret = 0; +out_cleanup: + kunmap_atomic(user_mem); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset) +{ + int ret = 0; + size_t clen; + unsigned long handle = 0; + struct page *page; + unsigned char *user_mem, *cmem, *src, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm = NULL; + unsigned long alloced_pages; + + page = bvec->bv_page; + if (is_partial_io(bvec)) { + /* + * This is a partial IO. We need to read the full page + * before to write the changes. 
+ */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + if (!uncmem) { + ret = -ENOMEM; + goto out; + } + ret = zram_decompress_page(zram, uncmem, index); + if (ret) + goto out; + } + +compress_again: + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) { + memcpy(uncmem + offset, user_mem + bvec->bv_offset, + bvec->bv_len); + kunmap_atomic(user_mem); + user_mem = NULL; + } else { + uncmem = user_mem; + } + + if (page_zero_filled(uncmem)) { + if (user_mem) + kunmap_atomic(user_mem); + /* Free memory associated with this sector now. */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_ZERO); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.zero_pages); + ret = 0; + goto out; + } + + zstrm = zcomp_strm_find(zram->comp); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + if (!is_partial_io(bvec)) { + kunmap_atomic(user_mem); + user_mem = NULL; + uncmem = NULL; + } + + if (unlikely(ret)) { + pr_err("Compression failed! err=%d\n", ret); + goto out; + } + + src = zstrm->buffer; + if (unlikely(clen > max_zpage_size)) { + clen = PAGE_SIZE; + if (is_partial_io(bvec)) + src = uncmem; + } + + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. + */ + if (!handle) + handle = zs_malloc(meta->mem_pool, clen, +#if 0 /* not implemented yet */ + __GFP_KSWAPD_RECLAIM | +#endif + __GFP_NOWARN | + __GFP_HIGHMEM); + if (!handle) { + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + + atomic64_inc(&zram->stats.writestall); + + handle = zs_malloc(meta->mem_pool, clen, + GFP_NOIO | __GFP_HIGHMEM); + if (handle) + goto compress_again; + + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", + index, clen); + ret = -ENOMEM; + goto out; + } + + alloced_pages = zs_get_total_pages(meta->mem_pool); + update_used_max(zram, alloced_pages); + + if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zs_free(meta->mem_pool, handle); + ret = -ENOMEM; + goto out; + } + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + + if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + src = kmap_atomic(page); + copy_page(cmem, src); + kunmap_atomic(src); + } else { + memcpy(cmem, src, clen); + } + + zcomp_strm_release(zram->comp, zstrm); + zstrm = NULL; + zs_unmap_object(meta->mem_pool, handle); + + /* + * Free memory associated with this sector + * before overwriting unused sectors. 
+ */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + + meta->table[index].handle = handle; + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Update stats */ + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); +out: + if (zstrm) + zcomp_strm_release(zram->comp, zstrm); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_size; + struct zram_meta *meta = zram->meta; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n <= (PAGE_SIZE - offset)) + return; + + n -= (PAGE_SIZE - offset); + index++; + } + + while (n >= PAGE_SIZE) { + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); + index++; + n -= PAGE_SIZE; + } +} + +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) +{ + unsigned long start_time = jiffies; + int ret; + + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); + + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); + } + + generic_end_io_acct(rw, &zram->disk->part0, start_time); + + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } + + return ret; +} + +static void __zram_make_request(struct zram *zram, struct bio *bio) +{ + int offset, rw, i; + u32 index; + struct bio_vec *bvec; + + index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, i) { + int max_transfer_size = PAGE_SIZE - offset; + + if (bvec->bv_len > max_transfer_size) { + /* + * zram_bvec_rw() can only make operation on a single + * zram page. Split the bio vector. 
+ */ + struct bio_vec bv; + + bv.bv_page = bvec->bv_page; + bv.bv_len = max_transfer_size; + bv.bv_offset = bvec->bv_offset; + + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + goto out; + + bv.bv_len = bvec->bv_len - max_transfer_size; + bv.bv_offset += max_transfer_size; + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) + goto out; + } else + if (zram_bvec_rw(zram, bvec, index, offset, rw) < 0) + goto out; + + update_position(&index, &offset, bvec); + } + + set_bit(BIO_UPTODATE, &bio->bi_flags); + bio_endio(bio, 0); + return; + +out: + bio_io_error(bio); +} + +/* + * Handler function for all zram I/O requests. + */ +static void zram_make_request(struct request_queue *queue, struct bio *bio) +{ + struct zram *zram = queue->queuedata; + + if (unlikely(!zram_meta_get(zram))) + goto error; + + if (!valid_io_request(zram, bio->bi_sector, + bio->bi_size)) { + atomic64_inc(&zram->stats.invalid_io); + goto put_zram; + } + + __zram_make_request(zram, bio); + zram_meta_put(zram); + return; +put_zram: + zram_meta_put(zram); +error: + bio_io_error(bio); +} + +static void zram_slot_free_notify(struct block_device *bdev, + unsigned long index) +{ + struct zram *zram; + struct zram_meta *meta; + + zram = bdev->bd_disk->private_data; + meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); +} + +/* not ready for this change right now: Dorimanx */ +#if 0 +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err = -EIO; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (unlikely(!zram_meta_get(zram))) + goto out; + + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + err = -EINVAL; + goto put_zram; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +put_zram: + zram_meta_put(zram); +out: + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} +#endif + +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; + + down_write(&zram->init_lock); + + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. 
+ */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + + set_capacity(zram->disk, 0); + part_stat_set_all(&zram->disk->part0, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; + struct zram *zram = dev_to_zram(dev); + int err; + + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; + + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->disk_name, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor); + if (IS_ERR(comp)) { + pr_err("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; + + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; + + if (!do_reset) + return -EINVAL; + + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, anyone can't open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed to reset so open request fails */ + if (zram->claim) + ret = -EBUSY; + + return ret; +} + +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, +#if 0 + .rw_page = zram_rw_page, +#endif + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static 
DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
+
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_num_reads.attr,
+	&dev_attr_num_writes.attr,
+	&dev_attr_failed_reads.attr,
+	&dev_attr_failed_writes.attr,
+	&dev_attr_compact.attr,
+	&dev_attr_invalid_io.attr,
+	&dev_attr_notify_free.attr,
+	&dev_attr_zero_pages.attr,
+	&dev_attr_orig_data_size.attr,
+	&dev_attr_compr_data_size.attr,
+	&dev_attr_mem_used_total.attr,
+	&dev_attr_mem_limit.attr,
+	&dev_attr_mem_used_max.attr,
+	&dev_attr_max_comp_streams.attr,
+	&dev_attr_comp_algorithm.attr,
+	&dev_attr_io_stat.attr,
+	&dev_attr_mm_stat.attr,
+	&dev_attr_debug_stat.attr,
+	NULL,
+};
+
+static struct attribute_group zram_disk_attr_group = {
+	.attrs = zram_disk_attrs,
+};
+
+/*
+ * Allocate and initialize new zram device. The function returns
+ * '>= 0' device_id upon success, and negative value otherwise.
+ */
+static int zram_add(void)
+{
+	struct zram *zram;
+	struct request_queue *queue;
+	int ret, device_id;
+
+	zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
+	if (!zram)
+		return -ENOMEM;
+
+	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out_free_dev;
+	device_id = ret;
+
+	init_rwsem(&zram->init_lock);
+
+	queue = blk_alloc_queue(GFP_KERNEL);
+	if (!queue) {
+		pr_err("Error allocating disk queue for device %d\n",
+			device_id);
+		ret = -ENOMEM;
+		goto out_free_idr;
+	}
+
+	blk_queue_make_request(queue, zram_make_request);
+
+	/* gendisk structure */
+	zram->disk = alloc_disk(1);
+	if (!zram->disk) {
+		pr_err("Error allocating disk structure for device %d\n",
+			device_id);
+		ret = -ENOMEM;
+		goto out_free_queue;
+	}
+
+	zram->disk->major = zram_major;
+	zram->disk->first_minor = device_id;
+	zram->disk->fops = &zram_devops;
+	zram->disk->queue = queue;
+	zram->disk->queue->queuedata = zram;
+	zram->disk->private_data = zram;
+	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+	__set_bit(QUEUE_FLAG_FAST, &zram->disk->queue->queue_flags);
+	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
+	set_capacity(zram->disk, 0);
+	/* zram devices sort of resemble non-rotational disks */
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+	/*
+	 * To ensure that we always get PAGE_SIZE aligned
+	 * and n*PAGE_SIZE sized I/O requests.
+	 */
+	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(zram->disk->queue,
+					ZRAM_LOGICAL_BLOCK_SIZE);
+	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
+	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+	/*
+	 * zram_bio_discard() will clear all logical blocks if logical block
+	 * size is identical to physical block size (PAGE_SIZE). But if it is
+	 * different, we will skip discarding some parts of logical blocks in
+	 * the part of the request range which isn't aligned to physical block
+	 * size. So we can't ensure that all discarded logical blocks are
+	 * zeroed.
+ */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); + + add_disk(zram->disk); + + ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + if (ret < 0) { + pr_err("Error creating sysfs group for device %d\n", + device_id); + goto out_free_disk; + } + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram->meta = NULL; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; + +out_free_disk: + del_gendisk(zram->disk); + put_disk(zram->disk); +out_free_queue: + blk_cleanup_queue(queue); +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); + return ret; +} + +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * hot_remove -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram-control sysfs attributes */ +static ssize_t hot_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t hot_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct zram *zram; + int ret, dev_id; + + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; + + mutex_lock(&zram_index_mutex); + + zram = idr_find(&zram_index_idr, dev_id); + if (zram) { + ret = zram_remove(zram); + idr_remove(&zram_index_idr, dev_id); + } else { + ret = -ENODEV; + } + + mutex_unlock(&zram_index_mutex); + return ret ? 
ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(hot_add), + __ATTR_WO(hot_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} + +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); + unregister_blkdev(zram_major, "zram"); +} + +static int __init zram_init(void) +{ + int ret; + +#ifdef CONFIG_STATE_NOTIFIER + notif.notifier_call = state_notifier_callback; + if (state_register_client(¬if)) + pr_warn("Failed to register State notifier callback\n"); +#endif + + ret = class_register(&zram_control_class); + if (ret) { + pr_err("Unable to register zram-control class\n"); + return ret; + } + + zram_major = register_blkdev(0, "zram"); + if (zram_major <= 0) { + pr_err("Unable to get major number\n"); + class_unregister(&zram_control_class); + return -EBUSY; + } + + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) + goto out_error; + num_devices--; + } + + return 0; + +out_error: + destroy_devices(); + return ret; +} + +static void __exit zram_exit(void) +{ +#ifdef CONFIG_STATE_NOTIFIER + state_unregister_client(¬if); + notif.notifier_call = NULL; +#endif + destroy_devices(); +} + +module_init(zram_init); +module_exit(zram_exit); + +module_param(num_devices, uint, 0); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta "); +MODULE_DESCRIPTION("Compressed RAM Block Device"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h new file mode 100644 index 000000000..3f5bf66a2 --- /dev/null +++ b/drivers/block/zram/zram_drv.h @@ -0,0 +1,122 @@ +/* + * Compressed RAM block device + * + * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the licence that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + * + */ + +#ifndef _ZRAM_DRV_H_ +#define _ZRAM_DRV_H_ + +#include +#include + +#include "zcomp.h" + +/*-- Configurable parameters */ + +/* + * Pages that compress to size greater than this are stored + * uncompressed in memory. + */ +static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; + +/* + * NOTE: max_zpage_size must be less than or equal to: + * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would + * always return failure. + */ + +/*-- End of configurable params */ + +#define SECTOR_SHIFT 9 +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) +#define ZRAM_LOGICAL_BLOCK_SHIFT 12 +#define ZRAM_LOGICAL_BLOCK_SIZE (1 << ZRAM_LOGICAL_BLOCK_SHIFT) +#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ + (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) + + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. 
+ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header),
+ * the higher bits is for zram_pageflags.
+ */
+#define ZRAM_FLAG_SHIFT 24
+
+/* Flags for zram pages (table[page_no].value) */
+enum zram_pageflags {
+	/* Page consists entirely of zeros */
+	ZRAM_ZERO = ZRAM_FLAG_SHIFT,
+	ZRAM_ACCESS,	/* page is now accessed */
+
+	__NR_ZRAM_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/* Allocated for each disk page */
+struct zram_table_entry {
+	unsigned long handle;
+	unsigned long value;
+};
+
+struct zram_stats {
+	atomic64_t compr_data_size;	/* compressed size of pages stored */
+	atomic64_t num_reads;		/* failed + successful */
+	atomic64_t num_writes;		/* --do-- */
+	atomic64_t failed_reads;	/* can happen when memory is too low */
+	atomic64_t failed_writes;	/* can happen when memory is too low */
+	atomic64_t invalid_io;		/* non-page-aligned I/O requests */
+	atomic64_t notify_free;		/* no. of swap slot free notifications */
+	atomic64_t zero_pages;		/* no. of zero filled pages */
+	atomic64_t pages_stored;	/* no. of pages currently stored */
+	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
+	atomic64_t writestall;		/* no. of write slow paths */
+};
+
+struct zram_meta {
+	struct zram_table_entry *table;
+	struct zs_pool *mem_pool;
+};
+
+struct zram {
+	struct zram_meta *meta;
+	struct zcomp *comp;
+	struct gendisk *disk;
+	/* Prevent concurrent execution of device init */
+	struct rw_semaphore init_lock;
+	/*
+	 * the number of pages zram can consume for storing compressed data
+	 */
+	unsigned long limit_pages;
+
+	struct zram_stats stats;
+	atomic_t refcount; /* refcount for zram_meta */
+	/* wait until all in-flight I/O on all CPUs is done */
+	wait_queue_head_t io_done;
+	/*
+	 * This is the limit on amount of *uncompressed* worth of data
+	 * we can store in a disk.
+	 */
+	u64 disksize;	/* bytes */
+	char compressor[10];
+	/*
+	 * zram is claimed, so any open request will fail
+	 */
+	bool claim; /* Protected by bdev->bd_mutex */
+};
+#endif

From f17dec212e1acb8b8c76b9603f4561f2f7a8a8ea Mon Sep 17 00:00:00 2001
From: "lance.zhou"
Date: Fri, 17 Jun 2016 07:48:01 -0400
Subject: [PATCH 29/52] mm: swap: don't delay swap free for fast swap devices

There are a couple of issues with swapcache usage when ZRAM is used
as a swap device.

1) The kernel does a swap readahead which can be around 6 to 8 pages
depending on total RAM; this is not required for zram since accesses
are fast.

2) The kernel delays the freeing up of swapcache expecting a later hit,
which again is useless in the case of zram.

3) This is not related to swapcache, but to zram usage itself. As
mentioned in (2), the kernel delays freeing of swapcache, but along with
that it also delays freeing the zram compressed page, i.e. there can be
two copies, though one is compressed.

This patch addresses these issues using two new flags, QUEUE_FLAG_FAST
and SWP_FAST, to indicate that accesses to the device will be fast and
cheap, and instructs the swap layer to free up swap space aggressively
and not to do readahead.
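The hunks below wire this up by propagating a single capability bit: the zram
driver sets QUEUE_FLAG_FAST on its request queue, swapon() translates that into
SWP_FAST on the swap_info_struct, and vm_swap_full()/swapin_readahead() consult
the flag to free swap slots eagerly and to collapse the readahead window to a
single page. The following stand-alone user-space sketch is illustrative only
(not kernel code); the flag values mirror the include/linux/swap.h hunk below,
and page_cluster = 3 is assumed as the usual kernel default.

/*
 * Illustrative model of the SWP_FAST decision logic added by this patch.
 * Helpers are re-declared here so the example compiles on its own.
 */
#include <stdbool.h>
#include <stdio.h>

#define SWP_SOLIDSTATE	(1 << 4)	/* blkdev seeks are cheap */
#define SWP_FAST	(1 << 10)	/* blkdev access is fast and cheap */

static const unsigned int page_cluster = 3;	/* assumed default: 2^3 pages */

/* vm_swap_full(): a fast device never benefits from delayed swap free */
static bool swap_full(unsigned int si_flags, long nr_swap, long total_swap)
{
	if (si_flags & SWP_FAST)
		return true;
	return nr_swap * 2 < total_swap;
}

/* swapin_readahead(): a fast device gets a readahead mask of 0 (one page) */
static unsigned long readahead_mask(unsigned int si_flags)
{
	return (si_flags & SWP_FAST) ? 0 : (1UL << page_cluster) - 1;
}

int main(void)
{
	unsigned int zram_flags = SWP_SOLIDSTATE | SWP_FAST;
	unsigned int disk_flags = 0;

	printf("zram: free eagerly=%d, readahead pages=%lu\n",
	       swap_full(zram_flags, 80, 100), readahead_mask(zram_flags) + 1);
	printf("disk: free eagerly=%d, readahead pages=%lu\n",
	       swap_full(disk_flags, 80, 100), readahead_mask(disk_flags) + 1);
	return 0;
}

Built with a plain C compiler, the model reports an eager-free, one-page policy
for the SWP_FAST device and the usual lazy-free, eight-page readahead for an
ordinary disk, which is exactly the behavioural change the hunks below make.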
Signed-off-by: Vinayak Menon Signed-off-by: Pranav Vashi --- include/linux/blkdev.h | 2 ++ include/linux/swap.h | 22 +++++++++++++++++----- mm/memory.c | 2 +- mm/swap_state.c | 3 ++- mm/swapfile.c | 30 ++++++++++++++++++++++++++---- mm/vmscan.c | 2 +- 6 files changed, 49 insertions(+), 12 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6619eb255..f134c8334 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -431,6 +431,7 @@ struct request_queue { #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ #define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_SANITIZE 19 /* supports SANITIZE */ +#define QUEUE_FLAG_FAST 20 /* fast block device (e.g. ram based) */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -513,6 +514,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define blk_queue_sanitize(q) test_bit(QUEUE_FLAG_SANITIZE, &(q)->queue_flags) #define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) +#define blk_queue_fast(q) test_bit(QUEUE_FLAG_FAST, &(q)->queue_flags) #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ diff --git a/include/linux/swap.h b/include/linux/swap.h index 886dda7f7..9781ed09b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -146,16 +146,20 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */ + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ + SWP_FILE = (1 << 7), /* set after swap_activate success */ + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ /* add others here before... */ - SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_FAST = (1 << 10), /* blkdev access is fast and cheap */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; -#define SWAP_CLUSTER_MAX 32 +#define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* @@ -354,10 +358,18 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern bool is_swap_fast(swp_entry_t entry); /* Swap 50% full? Release swapcache more aggressively.. */ -static inline bool vm_swap_full(void) +static inline bool vm_swap_full(struct swap_info_struct *si) { + /* + * If the swap device is fast, return true + * not to delay swap free. 
+ */ + if (si->flags & SWP_FAST) + return true; + return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; } @@ -402,7 +414,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages 0UL -#define vm_swap_full() 0 +#define vm_swap_full(si) 0 #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff --git a/mm/memory.c b/mm/memory.c index ec059db19..ed5b26277 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3258,7 +3258,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge_swapin(page, ptr); swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (vm_swap_full(page_swap_info(page)) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (swapcache) { diff --git a/mm/swap_state.c b/mm/swap_state.c index e561867c1..77c2332b9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -393,7 +393,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct page *page; unsigned long offset = swp_offset(entry); unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask = is_swap_fast(entry) ? 0 : + (1UL << page_cluster) - 1; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; diff --git a/mm/swapfile.c b/mm/swapfile.c index 99d16ec7c..8e55e729f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -73,6 +73,26 @@ static inline unsigned char swap_count(unsigned char ent) return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } +bool is_swap_fast(swp_entry_t entry) +{ + struct swap_info_struct *p; + unsigned long type; + + if (non_swap_entry(entry)) + return false; + + type = swp_type(entry); + if (type >= nr_swapfiles) + return false; + + p = swap_info[type]; + + if (p->flags & SWP_FAST) + return true; + + return false; +} + /* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) @@ -293,7 +313,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, scan_base = offset = si->lowest_bit; /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&si->lock); swap_was_freed = __try_to_reclaim_swap(si, offset); @@ -382,7 +402,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -397,7 +417,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, spin_lock(&si->lock); goto checks; } - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + if (vm_swap_full(si) && si->swap_map[offset] == SWAP_HAS_CACHE) { spin_lock(&si->lock); goto checks; } @@ -760,7 +780,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). 
 	 */
 	if (PageSwapCache(page) && !PageWriteback(page) &&
-	    (!page_mapped(page) || vm_swap_full())) {
+	    (!page_mapped(page) || vm_swap_full(page_swap_info(page)))) {
 		delete_from_swap_cache(page);
 		SetPageDirty(page);
 	}
@@ -2192,6 +2212,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		}
 		if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
 			p->flags |= SWP_DISCARDABLE;
+		if (blk_queue_fast(bdev_get_queue(p->bdev)))
+			p->flags |= SWP_FAST;
 	}
 
 	mutex_lock(&swapon_mutex);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 530fab823..29d01dae4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -974,7 +974,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 activate_locked:
 	/* Not a candidate for swapping, so reclaim swap space. */
-	if (PageSwapCache(page) && vm_swap_full())
+	if (PageSwapCache(page) && vm_swap_full(page_swap_info(page)))
 		try_to_free_swap(page);
 	VM_BUG_ON(PageActive(page));
 	SetPageActive(page);

From d6d7ff7b9257d13eae14916de38e270d1f2057ad Mon Sep 17 00:00:00 2001
From: carck
Date: Sun, 19 Jun 2016 04:05:17 -0400
Subject: [PATCH 30/52] ZRAM: add zram support

---
 arch/arm/configs/ik_defconfig            |    2 +
 block/Makefile                           |    1 +
 drivers/block/Kconfig                    |    2 +
 drivers/staging/Kconfig                  |    4 -
 drivers/staging/Makefile                 |    2 -
 drivers/staging/zram/Kconfig             |   25 -
 drivers/staging/zram/Makefile            |    3 -
 drivers/staging/zram/zram.txt            |   77 --
 drivers/staging/zram/zram_drv.c          |  995 --------------
 drivers/staging/zram/zram_drv.h          |  125 --
 drivers/staging/zram/zram_sysfs.c        |  240 ----
 drivers/staging/zsmalloc/Kconfig         |   10 -
 drivers/staging/zsmalloc/Makefile        |    3 -
 drivers/staging/zsmalloc/zsmalloc-main.c | 1072 ---------------
 drivers/staging/zsmalloc/zsmalloc.h      |   43 -
 drivers/staging/zsmalloc/zsmalloc_int.h  |  155 ---
 include/linux/cpu.h                      |    3 +
 include/linux/swap.h                     |    3 +-
 include/linux/zbud.h                     |   22 +
 include/linux/zpool.h                    |  110 ++
 include/linux/zsmalloc.h                 |   18 +-
 mm/Kconfig                               |   18 +
 mm/Makefile                              |    2 +
 mm/swap_state.c                          |   12 +-
 mm/zbud.c                                |  640 +++++++++
 mm/zpool.c                               |  359 +++++
 mm/zsmalloc.c                            | 1589 +++++++++++++++++-----
 mm/zswap.c                               |  782 ++++-------
 28 files changed, 2723 insertions(+), 3594 deletions(-)
 delete mode 100644 drivers/staging/zram/Kconfig
 delete mode 100644 drivers/staging/zram/Makefile
 delete mode 100644 drivers/staging/zram/zram.txt
 delete mode 100644 drivers/staging/zram/zram_drv.c
 delete mode 100644 drivers/staging/zram/zram_drv.h
 delete mode 100644 drivers/staging/zram/zram_sysfs.c
 delete mode 100644 drivers/staging/zsmalloc/Kconfig
 delete mode 100644 drivers/staging/zsmalloc/Makefile
 delete mode 100644 drivers/staging/zsmalloc/zsmalloc-main.c
 delete mode 100644 drivers/staging/zsmalloc/zsmalloc.h
 delete mode 100644 drivers/staging/zsmalloc/zsmalloc_int.h
 create mode 100644 include/linux/zbud.h
 create mode 100644 include/linux/zpool.h
 create mode 100644 mm/zbud.c
 create mode 100644 mm/zpool.c

diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig
index 0d1527adc..09688cb39 100644
--- a/arch/arm/configs/ik_defconfig
+++ b/arch/arm/configs/ik_defconfig
@@ -614,6 +614,8 @@ CONFIG_PGTABLE_MAPPING=y
 CONFIG_ZSWAP=y
 # CONFIG_SWAP_ENABLE_READAHEAD is not set
 # CONFIG_ZSWAP_ENABLE_WRITEBACK is not set
+CONFIG_ZPOOL=y
+# CONFIG_ZBUD is not set
 CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY=y
 CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y
 CONFIG_FIX_INACTIVE_RATIO=y
diff --git a/block/Makefile b/block/Makefile
index 25f094699..9085458d0 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
 obj-$(CONFIG_IOSCHED_SIO) +=
sio-iosched.o obj-$(CONFIG_IOSCHED_FIOPS) += fiops-iosched.o obj-$(CONFIG_IOSCHED_TEST) += test-iosched.o +obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a79640712..e93e2ded7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -118,6 +118,8 @@ source "drivers/block/paride/Kconfig" source "drivers/block/mtip32xx/Kconfig" +source "drivers/block/zram/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index d6931e9cd..f5edd3dc8 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -80,12 +80,8 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zram/Kconfig" - source "drivers/staging/zcache/Kconfig" -source "drivers/staging/zsmalloc/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 8229e82e9..4da2ea418 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,9 +32,7 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_ZCACHE) += zcache/ -obj-$(CONFIG_ZSMALLOC) += zsmalloc/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xx/ diff --git a/drivers/staging/zram/Kconfig b/drivers/staging/zram/Kconfig deleted file mode 100644 index 983314c41..000000000 --- a/drivers/staging/zram/Kconfig +++ /dev/null @@ -1,25 +0,0 @@ -config ZRAM - tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS - default n - help - Creates virtual block devices called /dev/zramX (X = 0, 1, ...). - Pages written to these disks are compressed and stored in memory - itself. These disks allow very fast I/O and compression provides - good amounts of memory savings. - - It has several use cases, for example: /tmp storage, use as swap - disks and maybe many more. - - See zram.txt for more information. - Project home: - -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" - depends on ZRAM - default n - help - This option adds additional debugging code to the compressed - RAM block device driver. diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile deleted file mode 100644 index cb0f9ced6..000000000 --- a/drivers/staging/zram/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zram-y := zram_drv.o - -obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt deleted file mode 100644 index 765d790ae..000000000 --- a/drivers/staging/zram/zram.txt +++ /dev/null @@ -1,77 +0,0 @@ -zram: Compressed RAM based block devices ----------------------------------------- - -Project home: http://compcache.googlecode.com/ - -* Introduction - -The zram module creates RAM based block devices named /dev/zram -( = 0, 1, ...). Pages written to these disks are compressed and stored -in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. 
Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) - -Statistics for individual zram devices are exported through sysfs nodes at -/sys/block/zram/ - -* Usage - -Following shows a typical sequence of steps for using zram. - -1) Load Module: - modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} - (num_devices parameter is optional. Default: 1) - -2) Set Disksize - Set disk size by writing the value to sysfs node 'disksize'. - The value can be either in bytes or you can use mem suffixes. - Examples: - # Initialize /dev/zram0 with 50MB disksize - echo $((50*1024*1024)) > /sys/block/zram0/disksize - - # Using mem suffixes - echo 256K > /sys/block/zram0/disksize - echo 512M > /sys/block/zram0/disksize - echo 1G > /sys/block/zram0/disksize - -3) Activate: - mkswap /dev/zram0 - swapon /dev/zram0 - - mkfs.ext4 /dev/zram1 - mount /dev/zram1 /tmp - -4) Stats: - Per-device statistics are exported as various nodes under - /sys/block/zram/ - disksize - num_reads - num_writes - invalid_io - notify_free - discard - zero_pages - orig_data_size - compr_data_size - mem_used_total - -5) Deactivate: - swapoff /dev/zram0 - umount /dev/zram1 - -6) Reset: - Write any positive value to 'reset' sysfs node - echo 1 > /sys/block/zram0/reset - echo 1 > /sys/block/zram1/reset - - This frees all the memory allocated for the given device and - resets the disksize to zero. You must set the disksize again - before reusing the device. - -Please report any problems at: - - Mailing list: linux-mm-cc at laptop dot org - - Issue tracker: http://code.google.com/p/compcache/issues/list - -Nitin Gupta -ngupta@vflare.org diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c deleted file mode 100644 index 83fa65773..000000000 --- a/drivers/staging/zram/zram_drv.c +++ /dev/null @@ -1,995 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com - */ - -#define KMSG_COMPONENT "zram" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zram_drv.h" - -/* Globals */ -static int zram_major; -static struct zram *zram_devices; - -/* - * We don't need to see memory allocation errors more than once every 1 - * second to know that a problem is occurring. 
- */ -#define ALLOC_ERROR_LOG_RATE_MS 1000 - -/* Module params (documentation at end) */ -static unsigned int num_devices = 1; - -static inline struct zram *dev_to_zram(struct device *dev) -{ - return (struct zram *)dev_to_disk(dev)->private_data; -} - -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", zram->disksize); -} - -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->stats.pages_zero); -} - -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} - -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; - - down_read(&zram->init_lock); - if (zram->init_done) - val = zs_get_total_size_bytes(meta->mem_pool); - up_read(&zram->init_lock); - - return sprintf(buf, "%llu\n", val); -} - -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - return meta->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].flags &= ~BIT(flag); -} - -static inline int is_partial_io(struct bio_vec *bvec) -{ - return bvec->bv_len != PAGE_SIZE; -} - -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline int valid_io_request(struct zram *zram, struct bio *bio) -{ - u64 start, end, bound; - - /* unaligned request */ - if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - start = bio->bi_sector; - end = start + (bio->bi_size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; -} - -static void zram_meta_free(struct zram_meta *meta) -{ - zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); - vfree(meta->table); - kfree(meta); -} - -static struct zram_meta *zram_meta_alloc(u64 disksize) -{ - size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - if (!meta) - goto out; - - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - - num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto free_buffer; - } - - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM | - __GFP_NOWARN); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto free_table; - } - - return meta; - -free_table: - vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); -free_meta: - kfree(meta); - meta = NULL; -out: - return meta; -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; - } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - -static void zram_free_page(struct zram *zram, size_t index) -{ - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; - - if (unlikely(!handle)) { - /* - * No memory is allocated for zero filled pages. - * Simply clear zero page flag. 
- */ - if (zram_test_flag(meta, index, ZRAM_ZERO)) { - zram_clear_flag(meta, index, ZRAM_ZERO); - zram->stats.pages_zero--; - } - return; - } - - if (unlikely(size > max_zpage_size)) - zram->stats.bad_compress--; - - zs_free(meta->mem_pool, handle); - - if (size <= PAGE_SIZE / 2) - zram->stats.good_compress--; - - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - zram->stats.pages_stored--; - - meta->table[index].handle = 0; - meta->table[index].size = 0; -} - -static int zram_decompress_page(struct zram *zram, char *mem, u32 index) -{ - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; - unsigned char *cmem; - struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; - - if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - clear_page(mem); - return 0; - } - - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (meta->table[index].size == PAGE_SIZE) - copy_page(mem, cmem); - else - ret = lzo1x_decompress_safe(cmem, meta->table[index].size, - mem, &clen); - zs_unmap_object(meta->mem_pool, handle); - - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { - pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - atomic64_inc(&zram->stats.failed_reads); - return ret; - } - - return 0; -} - -static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) -{ - int ret; - struct page *page; - unsigned char *user_mem, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - page = bvec->bv_page; - - if (unlikely(!meta->table[index].handle) || - zram_test_flag(meta, index, ZRAM_ZERO)) { - handle_zero_page(bvec); - return 0; - } - - if (is_partial_io(bvec)) - /* Use a temporary buffer to decompress the page */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - - user_mem = kmap_atomic(page); - if (!is_partial_io(bvec)) - uncmem = user_mem; - - if (!uncmem) { - pr_info("Unable to allocate temp memory\n"); - ret = -ENOMEM; - goto out_cleanup; - } - - ret = zram_decompress_page(zram, uncmem, index); - /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) - goto out_cleanup; - - if (is_partial_io(bvec)) - memcpy(user_mem + bvec->bv_offset, uncmem + offset, - bvec->bv_len); - - flush_dcache_page(page); - ret = 0; -out_cleanup: - kunmap_atomic(user_mem); - if (is_partial_io(bvec)) - kfree(uncmem); - return ret; -} - -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset) -{ - int ret = 0; - size_t clen; - unsigned long handle; - struct page *page; - unsigned char *user_mem, *cmem, *src, *uncmem = NULL; - struct zram_meta *meta = zram->meta; - static unsigned long zram_rs_time; - - page = bvec->bv_page; - src = meta->compress_buffer; - - if (is_partial_io(bvec)) { - /* - * This is a partial IO. We need to read the full page - * before to write the changes. - */ - uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); - if (!uncmem) { - ret = -ENOMEM; - goto out; - } - ret = zram_decompress_page(zram, uncmem, index); - if (ret) - goto out; - } - - user_mem = kmap_atomic(page); - - if (is_partial_io(bvec)) { - memcpy(uncmem + offset, user_mem + bvec->bv_offset, - bvec->bv_len); - kunmap_atomic(user_mem); - user_mem = NULL; - } else { - uncmem = user_mem; - } - - if (page_zero_filled(uncmem)) { - kunmap_atomic(user_mem); - /* Free memory associated with this sector now. 
*/ - zram_free_page(zram, index); - - zram->stats.pages_zero++; - zram_set_flag(meta, index, ZRAM_ZERO); - ret = 0; - goto out; - } - - /* - * zram_slot_free_notify could miss free so that let's - * double check. - */ - if (unlikely(meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO))) - zram_free_page(zram, index); - - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); - - if (!is_partial_io(bvec)) { - kunmap_atomic(user_mem); - user_mem = NULL; - uncmem = NULL; - } - - if (unlikely(ret != LZO_E_OK)) { - pr_err("Compression failed! err=%d\n", ret); - goto out; - } - - if (unlikely(clen > max_zpage_size)) { - zram->stats.bad_compress++; - clen = PAGE_SIZE; - src = NULL; - if (is_partial_io(bvec)) - src = uncmem; - } - - handle = zs_malloc(meta->mem_pool, clen); - if (!handle) { - if (printk_timed_ratelimit(&zram_rs_time, - ALLOC_ERROR_LOG_RATE_MS)) - pr_info("Error allocating memory for compressed page: %u, size=%zu\n", - index, clen); - ret = -ENOMEM; - goto out; - } - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); - - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { - src = kmap_atomic(page); - copy_page(cmem, src); - kunmap_atomic(src); - } else { - memcpy(cmem, src, clen); - } - - zs_unmap_object(meta->mem_pool, handle); - - /* - * Free memory associated with this sector - * before overwriting unused sectors. - */ - zram_free_page(zram, index); - - meta->table[index].handle = handle; - meta->table[index].size = clen; - - /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - zram->stats.pages_stored++; - if (clen <= PAGE_SIZE / 2) - zram->stats.good_compress++; - -out: - if (is_partial_io(bvec)) - kfree(uncmem); - - if (ret) - atomic64_inc(&zram->stats.failed_writes); - return ret; -} - -static void handle_pending_slot_free(struct zram *zram) -{ - struct zram_slot_free *free_rq; - - spin_lock(&zram->slot_free_lock); - while (zram->slot_free_rq) { - free_rq = zram->slot_free_rq; - zram->slot_free_rq = free_rq->next; - zram_free_page(zram, free_rq->index); - kfree(free_rq); - } - spin_unlock(&zram->slot_free_lock); -} - -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) -{ - int ret; - - if (rw == READ) { - down_read(&zram->lock); - handle_pending_slot_free(zram); - ret = zram_bvec_read(zram, bvec, index, offset, bio); - up_read(&zram->lock); - } else { - down_write(&zram->lock); - handle_pending_slot_free(zram); - ret = zram_bvec_write(zram, bvec, index, offset); - up_write(&zram->lock); - } - - return ret; -} - -static void zram_reset_device(struct zram *zram, bool reset_capacity) -{ - size_t index; - struct zram_meta *meta; - - flush_work(&zram->free_work); - - down_write(&zram->init_lock); - if (!zram->init_done) { - up_write(&zram->init_lock); - return; - } - - meta = zram->meta; - zram->init_done = 0; - - /* Free all pages that are still in this zram device */ - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { - unsigned long handle = meta->table[index].handle; - if (!handle) - continue; - - zs_free(meta->mem_pool, handle); - } - - zram_meta_free(zram->meta); - zram->meta = NULL; - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - - zram->disksize = 0; - if (reset_capacity) - set_capacity(zram->disk, 0); - up_write(&zram->init_lock); -} - -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point 
creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - - zram->meta = meta; - zram->init_done = 1; - - pr_debug("Initialization done!\n"); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); - down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - zram_meta_free(meta); - pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; - } - - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); - up_write(&zram->init_lock); - - return len; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - /* Do not reset an active device! */ - if (bdev->bd_holders) - return -EBUSY; - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - return ret; - - if (!do_reset) - return -EINVAL; - - /* Make sure all pending I/O is finished */ - if (bdev) - fsync_bdev(bdev); - - zram_reset_device(zram, true); - return len; -} - -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) -{ - int i, offset; - u32 index; - struct bio_vec *bvec; - - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } - - index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - bio_for_each_segment(bvec, bio, i) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec->bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. - */ - struct bio_vec bv; - - bv.bv_page = bvec->bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec->bv_offset; - - if (zram_bvec_rw(zram, &bv, index, offset, bio, rw) < 0) - goto out; - - bv.bv_len = bvec->bv_len - max_transfer_size; - bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) - goto out; - } else - if (zram_bvec_rw(zram, bvec, index, offset, bio, rw) - < 0) - goto out; - - update_position(&index, &offset, bvec); - } - - set_bit(BIO_UPTODATE, &bio->bi_flags); - bio_endio(bio, 0); - return; - -out: - bio_io_error(bio); -} - -/* - * Handler function for all zram I/O requests. 
- */ -static void zram_make_request(struct request_queue *queue, struct bio *bio) -{ - struct zram *zram = queue->queuedata; - - down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) - goto error; - - if (!valid_io_request(zram, bio)) { - atomic64_inc(&zram->stats.invalid_io); - goto error; - } - - __zram_make_request(zram, bio, bio_data_dir(bio)); - up_read(&zram->init_lock); - - return; - -error: - up_read(&zram->init_lock); - bio_io_error(bio); -} - -static void zram_slot_free(struct work_struct *work) -{ - struct zram *zram; - - zram = container_of(work, struct zram, free_work); - down_write(&zram->lock); - handle_pending_slot_free(zram); - up_write(&zram->lock); -} - -static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq) -{ - spin_lock(&zram->slot_free_lock); - free_rq->next = zram->slot_free_rq; - zram->slot_free_rq = free_rq; - spin_unlock(&zram->slot_free_lock); -} - -static void zram_slot_free_notify(struct block_device *bdev, - unsigned long index) -{ - struct zram *zram; - struct zram_slot_free *free_rq; - - zram = bdev->bd_disk->private_data; - atomic64_inc(&zram->stats.notify_free); - - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); - if (!free_rq) - return; - - free_rq->index = index; - add_slot_free(zram, free_rq); - schedule_work(&zram->free_work); -} - -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .owner = THIS_MODULE -}; - -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); - -static struct attribute *zram_disk_attrs[] = { - &dev_attr_disksize.attr, - &dev_attr_initstate.attr, - &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, - NULL, -}; - -static struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static int create_device(struct zram *zram, int device_id) -{ - int ret = -ENOMEM; - - init_rwsem(&zram->lock); - init_rwsem(&zram->init_lock); - - INIT_WORK(&zram->free_work, zram_slot_free); - spin_lock_init(&zram->slot_free_lock); - zram->slot_free_rq = NULL; - - zram->queue = blk_alloc_queue(GFP_KERNEL); - if (!zram->queue) { - pr_err("Error allocating disk queue for device %d\n", - device_id); - goto out; - } - - blk_queue_make_request(zram->queue, zram_make_request); - zram->queue->queuedata = zram; - - /* gendisk structure */ - zram->disk = alloc_disk(1); - if (!zram->disk) { - pr_warn("Error allocating disk structure for device %d\n", - device_id); - goto out_free_queue; - } - - zram->disk->major = zram_major; - zram->disk->first_minor = device_id; - zram->disk->fops = &zram_devops; - zram->disk->queue = zram->queue; - 
zram->disk->private_data = zram; - snprintf(zram->disk->disk_name, 16, "zram%d", device_id); - - /* Actual capacity set using syfs (/sys/block/zram/disksize */ - set_capacity(zram->disk, 0); - - /* - * To ensure that we always get PAGE_SIZE aligned - * and n*PAGE_SIZED sized I/O requests. - */ - blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); - blk_queue_logical_block_size(zram->disk->queue, - ZRAM_LOGICAL_BLOCK_SIZE); - blk_queue_io_min(zram->disk->queue, PAGE_SIZE); - blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); - - add_disk(zram->disk); - - ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); - if (ret < 0) { - pr_warn("Error creating sysfs group"); - goto out_free_disk; - } - - zram->init_done = 0; - return 0; - -out_free_disk: - del_gendisk(zram->disk); - put_disk(zram->disk); -out_free_queue: - blk_cleanup_queue(zram->queue); -out: - return ret; -} - -static void destroy_device(struct zram *zram) -{ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); - - if (zram->disk) { - del_gendisk(zram->disk); - put_disk(zram->disk); - } - - if (zram->queue) - blk_cleanup_queue(zram->queue); -} - -static int __init zram_init(void) -{ - int ret, dev_id; - - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - ret = -EINVAL; - goto out; - } - - zram_major = register_blkdev(0, "zram"); - if (zram_major <= 0) { - pr_warn("Unable to get major number\n"); - ret = -EBUSY; - goto out; - } - - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - ret = -ENOMEM; - goto unregister; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) - goto free_devices; - } - - pr_info("Created %u device(s) ...\n", num_devices); - - return 0; - -free_devices: - while (dev_id) - destroy_device(&zram_devices[--dev_id]); - kfree(zram_devices); -unregister: - unregister_blkdev(zram_major, "zram"); -out: - return ret; -} - -static void __exit zram_exit(void) -{ - int i; - struct zram *zram; - - for (i = 0; i < num_devices; i++) { - zram = &zram_devices[i]; - - destroy_device(zram); - /* - * Shouldn't access zram->disk after destroy_device - * because destroy_device already released zram->disk. - */ - zram_reset_device(zram, false); - } - - unregister_blkdev(zram_major, "zram"); - - kfree(zram_devices); - pr_debug("Cleanup done!\n"); -} - -module_init(zram_init); -module_exit(zram_exit); - -module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Nitin Gupta "); -MODULE_DESCRIPTION("Compressed RAM Block Device"); -MODULE_ALIAS("devname:zram"); diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h deleted file mode 100644 index 508a19f44..000000000 --- a/drivers/staging/zram/zram_drv.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. 
- * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com - */ - -#ifndef _ZRAM_DRV_H_ -#define _ZRAM_DRV_H_ - -#include -#include - -#include "../zsmalloc/zsmalloc.h" - -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; - -/*-- Configurable parameters */ - -/* - * Pages that compress to size greater than this are stored - * uncompressed in memory. - */ -static const size_t max_zpage_size = PAGE_SIZE / 10 * 9; - -/* - * NOTE: max_zpage_size must be less than or equal to: - * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would - * always return failure. - */ - -/*-- End of configurable params */ - -#define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) -#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) -#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) -#define ZRAM_LOGICAL_BLOCK_SHIFT 12 -#define ZRAM_LOGICAL_BLOCK_SIZE (1 << ZRAM_LOGICAL_BLOCK_SHIFT) -#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ - (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) - -/* Flags for zram pages (table[page_no].flags) */ -enum zram_pageflags { - /* Page consists entirely of zeros */ - ZRAM_ZERO, - - __NR_ZRAM_PAGEFLAGS, -}; - -/*-- Data structures */ - -/* Allocated for each disk page */ -struct table { - unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ - u8 flags; -} __aligned(4); - -/* - * All 64bit fields should only be manipulated by 64bit atomic accessors. - * All modifications to 32bit counter should be protected by zram->lock. - */ -struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ - atomic64_t num_reads; /* failed + successful */ - atomic64_t num_writes; /* --do-- */ - atomic64_t failed_reads; /* should NEVER! happen */ - atomic64_t failed_writes; /* can happen when memory is too low */ - atomic64_t invalid_io; /* non-page-aligned I/O requests */ - atomic64_t notify_free; /* no. of swap slot free notifications */ - u32 pages_zero; /* no. of zero filled pages */ - u32 pages_stored; /* no. of pages currently stored */ - u32 good_compress; /* % of pages with compression ratio<=50% */ - u32 bad_compress; /* % of pages with compression ratio>=75% */ -}; - -struct zram_meta { - void *compress_workmem; - void *compress_buffer; - struct table *table; - struct zs_pool *mem_pool; -}; - -struct zram_slot_free { - unsigned long index; - struct zram_slot_free *next; -}; - -struct zram { - struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, table, - * 32bit stat counters against concurrent - * notifications, reads and writes */ - - struct work_struct free_work; /* handle pending free request */ - struct zram_slot_free *slot_free_rq; /* list head of free request */ - - struct request_queue *queue; - struct gendisk *disk; - int init_done; - /* Prevent concurrent execution of device init, reset and R/W request */ - struct rw_semaphore init_lock; - /* - * This is the limit on amount of *uncompressed* worth of data - * we can store in a disk. 
- */ - u64 disksize; /* bytes */ - spinlock_t slot_free_lock; - - struct zram_stats stats; -}; -#endif diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c deleted file mode 100644 index aafcf0330..000000000 --- a/drivers/staging/zram/zram_sysfs.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com/ - */ - -#include -#include -#include - -#include "zram_drv.h" - -static u64 zram_stat64_read(struct zram *zram, u64 *v) -{ - u64 val; - - spin_lock(&zram->stat64_lock); - val = *v; - spin_unlock(&zram->stat64_lock); - - return val; -} - -static struct zram *dev_to_zram(struct device *dev) -{ - int i; - struct zram *zram = NULL; - - for (i = 0; i < zram_get_num_devices(); i++) { - zram = &zram_devices[i]; - if (disk_to_dev(zram->disk) == dev) - break; - } - - return zram; -} - -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", zram->disksize); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - u64 disksize; - struct zram *zram = dev_to_zram(dev); - - ret = kstrtoull(buf, 10, &disksize); - if (ret) - return ret; - - down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; - } - - zram->disksize = PAGE_ALIGN(disksize); - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); - - return len; -} - -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - if (!bdev) - return -ENOMEM; - - /* Do not reset an active device! 
*/ - if (bdev->bd_holders) { - ret = -EBUSY; - goto out; - } - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; - - if (!do_reset) { - ret = -EINVAL; - goto out; - } - - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - bdput(bdev); - - down_write(&zram->init_lock); - if (zram->init_done) - __zram_reset_device(zram); - up_write(&zram->init_lock); - - return len; - -out: - bdput(bdev); - return ret; -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->stats.pages_zero); -} - -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} - -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.compr_size)); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - if (zram->init_done) { - val = zs_get_total_size_bytes(zram->mem_pool) + - ((u64)(zram->stats.pages_expand) << PAGE_SHIFT); - } - up_read(&zram->init_lock); - - return sprintf(buf, "%llu\n", val); -} - -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); - -static struct attribute *zram_disk_attrs[] = { - &dev_attr_disksize.attr, - &dev_attr_initstate.attr, - &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, - NULL, -}; - 
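The attribute list above is what the driver exposes under /sys/block/zram<id>/. As a rough userspace illustration only (it assumes the module is loaded, a zram0 device exists, and the caller has the necessary privileges), the disk size can be set and the counters read back as below; per disksize_store() above, the size must be written before the device is initialized.

/* Sketch: drive the zram sysfs attributes from userspace (assumes /sys/block/zram0). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_attr(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0)
                return;
        if (write(fd, val, strlen(val)) < 0)
                perror(path);
        close(fd);
}

static void show_attr(const char *path)
{
        char buf[64];
        ssize_t n;
        int fd = open(path, O_RDONLY);

        if (fd < 0)
                return;
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("%s: %s", path, buf);
        }
        close(fd);
}

int main(void)
{
        /* 256 MiB of uncompressed capacity; must be set before first use. */
        write_attr("/sys/block/zram0/disksize", "268435456");

        show_attr("/sys/block/zram0/disksize");
        show_attr("/sys/block/zram0/initstate");
        show_attr("/sys/block/zram0/mem_used_total");
        return 0;
}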
-struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; diff --git a/drivers/staging/zsmalloc/Kconfig b/drivers/staging/zsmalloc/Kconfig deleted file mode 100644 index 7fab03229..000000000 --- a/drivers/staging/zsmalloc/Kconfig +++ /dev/null @@ -1,10 +0,0 @@ -config ZSMALLOC - bool "Memory allocator for compressed pages" - default n - help - zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile deleted file mode 100644 index b134848a5..000000000 --- a/drivers/staging/zsmalloc/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zsmalloc-y := zsmalloc-main.o - -obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c deleted file mode 100644 index 41a68032c..000000000 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - - -/* - * This allocator is designed for use with zcache and zram. Thus, the - * allocator is supposed to work well under low memory conditions. In - * particular, it never attempts higher order page allocation which is - * very likely to fail under memory pressure. On the other hand, if we - * just use single (0-order) pages, it would suffer from very high - * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy - * an entire page. This was one of the major issues with its predecessor - * (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * Following is how we use various fields and flags of underlying - * struct page(s) to form a zspage. - * - * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page - * page->index (union with page->freelist): offset of the first object - * starting in this page. For the first page, this is - * always 0, so we use this field (aka freelist) to point - * to the first free object in zspage. - * page->lru: links together all component pages (except the first page) - * of a zspage - * - * For _first_ page only: - * - * page->private (union with page->first_page): refers to the - * component page after the first page - * page->freelist: points to the first free object in zspage. - * Free objects are linked together using in-place - * metadata. - * page->objects: maximum number of objects we can store in this - * zspage (class->zspage_order * PAGE_SIZE / class->size) - * page->lru: links together first pages of various zspages. - * Basically forming list of zspages in a fullness group. 
- * page->mapping: class index and fullness group of the zspage - * - * Usage of struct page flags: - * PG_private: identifies the first component page - * PG_private2: identifies the last component page - * - */ - -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "zsmalloc.h" - -/* - * This must be power of 2 and greater than of equal to sizeof(link_free). - * These two conditions ensure that any 'struct link_free' itself doesn't - * span more than 1 page which avoids complex case of mapping 2 pages simply - * to restore link_free pointer values. - */ -#define ZS_ALIGN 8 - -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - -/* - * Object location (, ) is encoded as - * as single (void *) handle value. - * - * Note that object index is relative to system - * page it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. - * - * This is made more complicated by various memory models and PAE. - */ - -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ -/* - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just - * be PAGE_SHIFT - */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG -#endif -#endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) - -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) -/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ -#define ZS_MIN_ALLOC_SIZE \ - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -#define ZS_MAX_ALLOC_SIZE PAGE_SIZE - -/* - * On systems with 4K page size, this gives 254 size classes! There is a - * trader-off here: - * - Large number of size classes is potentially wasteful as free page are - * spread across these classes - * - Small number of size classes causes large internal fragmentation - * - Probably its better to use specific size classes (empirically - * determined). NOTE: all those class sizes must be set as multiple of - * ZS_ALIGN to make sure link_free itself never has to span 2 pages. - * - * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN - * (reason above) - */ -#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) - -/* - * We do not maintain any list for completely empty or full pages - */ -enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - - ZS_EMPTY, - ZS_FULL -}; - -/* - * We assign a page to ZS_ALMOST_EMPTY fullness group when: - * n <= N / f, where - * n = number of allocated objects - * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac - * - * Similarly, we assign zspage to: - * ZS_ALMOST_FULL when n > N / f - * ZS_EMPTY when n == 0 - * ZS_FULL when n == N - * - * (see: fix_fullness_group()) - */ -static const int fullness_threshold_frac = 4; - -struct size_class { - /* - * Size of objects stored in this class. Must be multiple - * of ZS_ALIGN. 
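To make the size-class layout above concrete: with the values in this file and an assumed 4 KiB page on a 32-bit configuration (so OBJ_INDEX_BITS works out to PAGE_SHIFT and ZS_MIN_ALLOC_SIZE to 32), classes are ZS_SIZE_CLASS_DELTA = 16 bytes apart. The standalone sketch below re-derives get_size_class_index() for a few sizes under those assumptions; it is illustrative, not part of the patch.

/* Sketch: zsmalloc size-class selection, assuming 4 KiB pages, 32-bit layout. */
#include <stdio.h>

#define PAGE_SHIFT              12              /* assumption */
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define ZS_MIN_ALLOC_SIZE       32              /* MAX(32, ...) for this config */
#define ZS_SIZE_CLASS_DELTA     (PAGE_SIZE >> 8)        /* 16 bytes */
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

static unsigned long get_size_class_index(unsigned long size)
{
        if (size <= ZS_MIN_ALLOC_SIZE)
                return 0;
        return DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA);
}

int main(void)
{
        unsigned long sizes[] = { 16, 32, 200, 2048, 4096 };
        unsigned long i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned long idx = get_size_class_index(sizes[i]);

                /* Each class stores objects of a fixed, rounded-up size. */
                printf("size %4lu -> class %3lu (object size %4lu)\n",
                       sizes[i], idx,
                       ZS_MIN_ALLOC_SIZE + idx * ZS_SIZE_CLASS_DELTA);
        }
        return 0;
}

For example, a 200-byte allocation lands in the 208-byte class, while a 4096-byte allocation lands in the last (uncompressible, whole-page) class.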
- */ - int size; - unsigned int index; - - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int pages_per_zspage; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; -}; - -/* - * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. - * - * This must be power of 2 and less than or equal to ZS_ALIGN - */ -struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; -}; - -struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; - - gfp_t flags; /* allocation flags used when growing pool */ -}; - -/* - * A zspage's class index and fullness group - * are encoded in its (first)page->mapping - */ -#define CLASS_IDX_BITS 28 -#define FULLNESS_BITS 4 -#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) -#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) - -/* - * By default, zsmalloc uses a copy-based object mapping method to access - * allocations that span two pages. However, if a particular architecture - * performs VM mapping faster than copying, then it should be added here - * so that USE_PGTABLE_MAPPING is defined. This causes zsmalloc to use - * page table mapping rather than copying for object mapping. - */ -#if defined(CONFIG_ARM) && !defined(MODULE) -#define USE_PGTABLE_MAPPING -#endif - -struct mapping_area { -#ifdef USE_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else - char *vm_buf; /* copy buffer for objects that span pages */ -#endif - char *vm_addr; /* address of kmap_atomic()'ed pages */ - enum zs_mapmode vm_mm; /* mapping mode */ -}; - - -/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); - -static int is_first_page(struct page *page) -{ - return PagePrivate(page); -} - -static int is_last_page(struct page *page) -{ - return PagePrivate2(page); -} - -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, - enum fullness_group *fullness) -{ - unsigned long m; - BUG_ON(!is_first_page(page)); - - m = (unsigned long)page->mapping; - *fullness = m & FULLNESS_MASK; - *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; -} - -static void set_zspage_mapping(struct page *page, unsigned int class_idx, - enum fullness_group fullness) -{ - unsigned long m; - BUG_ON(!is_first_page(page)); - - m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | - (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; -} - -static int get_size_class_index(int size) -{ - int idx = 0; - - if (likely(size > ZS_MIN_ALLOC_SIZE)) - idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, - ZS_SIZE_CLASS_DELTA); - - return idx; -} - -static enum fullness_group get_fullness_group(struct page *page) -{ - int inuse, max_objects; - enum fullness_group fg; - BUG_ON(!is_first_page(page)); - - inuse = page->inuse; - max_objects = page->objects; - - if (inuse == 0) - fg = ZS_EMPTY; - else if (inuse == max_objects) - fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) - fg = ZS_ALMOST_EMPTY; - else - fg = ZS_ALMOST_FULL; - - return fg; -} - -static void insert_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) -{ - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); - - *head = page; -} 
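get_fullness_group() above buckets a zspage by how full it is, with fullness_threshold_frac = 4 meaning "almost empty" is at most a quarter used. A minimal standalone restatement of that decision, runnable on its own to get a feel for the thresholds (illustration only):

/* Sketch of zsmalloc's fullness bucketing (threshold fraction = 4). */
#include <stdio.h>

enum fullness_group {
        ZS_ALMOST_FULL,
        ZS_ALMOST_EMPTY,
        ZS_EMPTY,
        ZS_FULL
};

static const int fullness_threshold_frac = 4;

static enum fullness_group get_fullness_group(int inuse, int max_objects)
{
        if (inuse == 0)
                return ZS_EMPTY;
        if (inuse == max_objects)
                return ZS_FULL;
        if (inuse <= max_objects / fullness_threshold_frac)
                return ZS_ALMOST_EMPTY;
        return ZS_ALMOST_FULL;
}

int main(void)
{
        static const char * const names[] = {
                "ALMOST_FULL", "ALMOST_EMPTY", "EMPTY", "FULL"
        };
        int inuse;

        /* A zspage that can hold 16 objects of its class size. */
        for (inuse = 0; inuse <= 16; inuse += 4)
                printf("inuse=%2d/16 -> ZS_%s\n", inuse,
                       names[get_fullness_group(inuse, 16)]);
        return 0;
}

Only the two "almost" groups get freelists; completely empty zspages are freed and completely full ones need no search, which is why the real enum keeps them outside _ZS_NR_FULLNESS_GROUPS.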
- -static void remove_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) -{ - struct page **head; - - BUG_ON(!is_first_page(page)); - - if (fullness >= _ZS_NR_FULLNESS_GROUPS) - return; - - head = &class->fullness_list[fullness]; - BUG_ON(!*head); - if (list_empty(&(*head)->lru)) - *head = NULL; - else if (*head == page) - *head = (struct page *)list_entry((*head)->lru.next, - struct page, lru); - - list_del_init(&page->lru); -} - -static enum fullness_group fix_fullness_group(struct zs_pool *pool, - struct page *page) -{ - int class_idx; - struct size_class *class; - enum fullness_group currfg, newfg; - - BUG_ON(!is_first_page(page)); - - get_zspage_mapping(page, &class_idx, &currfg); - newfg = get_fullness_group(page); - if (newfg == currfg) - goto out; - - class = &pool->size_class[class_idx]; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); - -out: - return newfg; -} - -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp - Zp % size_class - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. - */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - -/* - * A single 'zspage' is composed of many system pages which are - * linked together using fields in struct page. This function finds - * the first/head page, given any component page of a zspage. - */ -static struct page *get_first_page(struct page *page) -{ - if (is_first_page(page)) - return page; - else - return page->first_page; -} - -static struct page *get_next_page(struct page *page) -{ - struct page *next; - - if (is_last_page(page)) - next = NULL; - else if (is_first_page(page)) - next = (struct page *)page_private(page); - else - next = list_entry(page->lru.next, struct page, lru); - - return next; -} - -/* - * Encode as a single handle value. - * On hardware platforms with physical memory starting at 0x0 the pfn - * could be 0 so we ensure that the handle will never be 0 by adjusting the - * encoded obj_idx value before encoding. - */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) -{ - unsigned long handle; - - if (!page) { - BUG_ON(obj_idx); - return NULL; - } - - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); - - return (void *)handle; -} - -/* - * Decode pair from the given object handle. We adjust the - * decoded obj_idx back to its original value since it was adjusted in - * obj_location_to_handle(). 
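The wastage computation in get_pages_per_zspage() above can be checked by hand: for a class size of 3/8 of a 4 KiB page (1536 bytes), a 3-page zspage fits exactly 8 objects with no slack, so the loop picks order 3. A small standalone re-derivation, assuming 4 KiB pages (illustration only, not part of the patch):

/* Sketch: pick the zspage order (1..4 pages) with the least wasted space. */
#include <stdio.h>

#define PAGE_SIZE               4096    /* assumption: 4 KiB pages */
#define ZS_MAX_PAGES_PER_ZSPAGE 4

static int get_pages_per_zspage(int class_size)
{
        int i, max_usedpc = 0;
        int max_usedpc_order = 1;

        for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
                int zspage_size = i * PAGE_SIZE;
                int waste = zspage_size % class_size;
                int usedpc = (zspage_size - waste) * 100 / zspage_size;

                printf("  %d page(s): waste %4d bytes, %3d%% used\n",
                       i, waste, usedpc);
                if (usedpc > max_usedpc) {
                        max_usedpc = usedpc;
                        max_usedpc_order = i;
                }
        }
        return max_usedpc_order;
}

int main(void)
{
        int class_size = 1536;  /* 3/8 of PAGE_SIZE */

        printf("class size %d:\n", class_size);
        printf("-> use %d page(s) per zspage\n",
               get_pages_per_zspage(class_size));
        return 0;
}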
- */ -static void obj_handle_to_location(unsigned long handle, struct page **page, - unsigned long *obj_idx) -{ - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; -} - -static unsigned long obj_idx_to_offset(struct page *page, - unsigned long obj_idx, int class_size) -{ - unsigned long off = 0; - - if (!is_first_page(page)) - off = page->index; - - return off + obj_idx * class_size; -} - -static void reset_page(struct page *page) -{ - clear_bit(PG_private, &page->flags); - clear_bit(PG_private_2, &page->flags); - set_page_private(page, 0); - page->mapping = NULL; - page->freelist = NULL; - reset_page_mapcount(page); -} - -static void free_zspage(struct page *first_page) -{ - struct page *nextp, *tmp, *head_extra; - - BUG_ON(!is_first_page(first_page)); - BUG_ON(first_page->inuse); - - head_extra = (struct page *)page_private(first_page); - - reset_page(first_page); - __free_page(first_page); - - /* zspage with only 1 system page */ - if (!head_extra) - return; - - list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { - list_del(&nextp->lru); - reset_page(nextp); - __free_page(nextp); - } - reset_page(head_extra); - __free_page(head_extra); -} - -/* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) -{ - unsigned long off = 0; - struct page *page = first_page; - - BUG_ON(!is_first_page(first_page)); - while (page) { - struct page *next_page; - struct link_free *link; - unsigned int i, objs_on_page; - - /* - * page->index stores offset of first object starting - * in the page. For the first page, this is always 0, - * so we use first_page->index (aka ->freelist) to store - * head of corresponding zspage's freelist. - */ - if (page != first_page) - page->index = off; - - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; - - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } - } - - /* - * We now come to the last (full or partial) object on this - * page, which must point to the first object on the next - * page (if present) - */ - next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); - page = next_page; - off = (off + class->size) % PAGE_SIZE; - } -} - -/* - * Allocate a zspage for the given size class - */ -static struct page *alloc_zspage(struct size_class *class, gfp_t flags) -{ - int i, error; - struct page *first_page = NULL, *uninitialized_var(prev_page); - - /* - * Allocate individual pages and link them together as: - * 1. first page->private = first sub-page - * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page - * - * For each size class, First/Head pages are linked together using - * page->lru. Also, we set PG_private to identify the first page - * (i.e. no other sub-page has this flag set) and PG_private_2 to - * identify the last page. 
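obj_location_to_handle() and obj_handle_to_location() above pack a <page frame, object index> pair into one unsigned long, bumping the index by one so that a valid handle can never be 0. The round trip can be sketched without struct page by treating the PFN as a plain number; OBJ_INDEX_BITS = 12 is assumed here, as on a 32-bit build with 4 KiB pages (illustration only):

/* Sketch: zsmalloc handle packing, assuming OBJ_INDEX_BITS == 12. */
#include <assert.h>
#include <stdio.h>

#define OBJ_INDEX_BITS  12
#define OBJ_INDEX_MASK  ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_handle(unsigned long pfn, unsigned long obj_idx)
{
        /* obj_idx is stored +1 so a handle of 0 always means "no object". */
        return (pfn << OBJ_INDEX_BITS) | ((obj_idx + 1) & OBJ_INDEX_MASK);
}

static void handle_to_location(unsigned long handle, unsigned long *pfn,
                               unsigned long *obj_idx)
{
        *pfn = handle >> OBJ_INDEX_BITS;
        *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
}

int main(void)
{
        unsigned long handle = location_to_handle(0x1234, 7);
        unsigned long pfn, obj_idx;

        handle_to_location(handle, &pfn, &obj_idx);
        assert(pfn == 0x1234 && obj_idx == 7);
        printf("handle 0x%lx -> pfn 0x%lx, obj_idx %lu\n", handle, pfn, obj_idx);
        return 0;
}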
- */ - error = -ENOMEM; - for (i = 0; i < class->pages_per_zspage; i++) { - struct page *page; - - page = alloc_page(flags); - if (!page) - goto cleanup; - - INIT_LIST_HEAD(&page->lru); - if (i == 0) { /* first page */ - SetPagePrivate(page); - set_page_private(page, 0); - first_page = page; - first_page->inuse = 0; - } - if (i == 1) - set_page_private(first_page, (unsigned long)page); - if (i >= 1) - page->first_page = first_page; - if (i >= 2) - list_add(&page->lru, &prev_page->lru); - if (i == class->pages_per_zspage - 1) /* last page */ - SetPagePrivate2(page); - prev_page = page; - } - - init_zspage(first_page, class); - - first_page->freelist = obj_location_to_handle(first_page, 0); - /* Maximum number of objects we can store in this zspage */ - first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; - - error = 0; /* Success */ - -cleanup: - if (unlikely(error) && first_page) { - free_zspage(first_page); - first_page = NULL; - } - - return first_page; -} - -static struct page *find_get_zspage(struct size_class *class) -{ - int i; - struct page *page; - - for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { - page = class->fullness_list[i]; - if (page) - break; - } - - return page; -} - -#ifdef USE_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL); - if (!area->vm) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* USE_PGTABLE_MAPPING */ - -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm_buf) - return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); - if (!area->vm_buf) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); - area->vm_buf = NULL; -} - -static void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - int sizes[2]; - void *addr; - char *buf = area->vm_buf; - - /* disable page faults to match kmap_atomic() return conditions */ - pagefault_disable(); - - /* no read fastpath */ - if (area->vm_mm == ZS_MM_WO) - goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy object to per-cpu buffer */ - addr = kmap_atomic(pages[0]); - memcpy(buf, addr + off, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); - memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_atomic(addr); -out: - return area->vm_buf; -} - -static void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - int sizes[2]; - void *addr; - char *buf = area->vm_buf; - - /* no write fastpath */ - if (area->vm_mm == ZS_MM_RO) 
- goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy per-cpu buffer to object */ - addr = kmap_atomic(pages[0]); - memcpy(addr + off, buf, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); - memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_atomic(addr); - -out: - /* enable page faults to match kunmap_atomic() return conditions */ - pagefault_enable(); -} - -#endif /* USE_PGTABLE_MAPPING */ - -static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action, - void *pcpu) -{ - int ret, cpu = (long)pcpu; - struct mapping_area *area; - - switch (action) { - case CPU_UP_PREPARE: - area = &per_cpu(zs_map_area, cpu); - ret = __zs_cpu_up(area); - if (ret) - return notifier_from_errno(ret); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block zs_cpu_nb = { - .notifier_call = zs_cpu_notifier -}; - -static void zs_exit(void) -{ - int cpu; - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); -} - -static int zs_init(void) -{ - int cpu, ret; - - register_cpu_notifier(&zs_cpu_nb); - for_each_online_cpu(cpu) { - ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) - goto fail; - } - return 0; -fail: - zs_exit(); - return notifier_to_errno(ret); -} - -/** - * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata - * - * This function must be called before anything when using - * the zsmalloc allocator. - * - * On success, a pointer to the newly created pool is returned, - * otherwise NULL. - */ -struct zs_pool *zs_create_pool(gfp_t flags) -{ - int i, ovhd_size; - struct zs_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); - if (!pool) - return NULL; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int size; - struct size_class *class; - - size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; - if (size > ZS_MAX_ALLOC_SIZE) - size = ZS_MAX_ALLOC_SIZE; - - class = &pool->size_class[i]; - class->size = size; - class->index = i; - spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); - - } - - pool->flags = flags; - - return pool; -} -EXPORT_SYMBOL_GPL(zs_create_pool); - -void zs_destroy_pool(struct zs_pool *pool) -{ - int i; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int fg; - struct size_class *class = &pool->size_class[i]; - - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { - pr_info("Freeing non-empty class with size %db, fullness group %d\n", - class->size, fg); - } - } - } - kfree(pool); -} -EXPORT_SYMBOL_GPL(zs_destroy_pool); - -/** - * zs_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * - * On success, handle to the allocated object is returned, - * otherwise 0. - * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
- */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) -{ - unsigned long obj; - struct link_free *link; - int class_idx; - struct size_class *class; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; - - if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; - - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); - - spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { - spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); - if (unlikely(!first_page)) - return 0; - - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; - } - - obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); - first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); - - first_page->inuse++; - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); - spin_unlock(&class->lock); - - return obj; -} -EXPORT_SYMBOL_GPL(zs_malloc); - -void zs_free(struct zs_pool *pool, unsigned long obj) -{ - struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; - - int class_idx; - struct size_class *class; - enum fullness_group fullness; - - if (unlikely(!obj)) - return; - - obj_handle_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - - spin_lock(&class->lock); - - /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); - link->next = first_page->freelist; - kunmap_atomic(link); - first_page->freelist = (void *)obj; - - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - - spin_unlock(&class->lock); - - if (fullness == ZS_EMPTY) - free_zspage(first_page); -} -EXPORT_SYMBOL_GPL(zs_free); - -/** - * zs_map_object - get address of allocated object from handle. - * @pool: pool from which the object was allocated - * @handle: handle returned from zs_malloc - * - * Before using an object allocated from zs_malloc, it must be mapped using - * this function. When done with the object, it must be unmapped using - * zs_unmap_object. - * - * Only one object can be mapped per cpu at a time. There is no protection - * against nested mappings. - * - * This function returns with preemption and page faults disabled. - */ -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm) -{ - struct page *page; - unsigned long obj_idx, off; - - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; - struct page *pages[2]; - - BUG_ON(!handle); - - /* - * Because we use per-cpu mapping areas shared among the - * pools/users, we can't allow mapping in interrupt context - * because it can corrupt another users mappings. 
- */ - BUG_ON(in_interrupt()); - - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); - - area = &get_cpu_var(zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ - area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; - } - - /* this object spans two pages */ - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); - - return __zs_map_object(area, pages, off, class->size); -} -EXPORT_SYMBOL_GPL(zs_map_object); - -void zs_unmap_object(struct zs_pool *pool, unsigned long handle) -{ - struct page *page; - unsigned long obj_idx, off; - - unsigned int class_idx; - enum fullness_group fg; - struct size_class *class; - struct mapping_area *area; - - BUG_ON(!handle); - - obj_handle_to_location(handle, &page, &obj_idx); - get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; - off = obj_idx_to_offset(page, obj_idx, class->size); - - area = &__get_cpu_var(zs_map_area); - if (off + class->size <= PAGE_SIZE) - kunmap_atomic(area->vm_addr); - else { - struct page *pages[2]; - - pages[0] = page; - pages[1] = get_next_page(page); - BUG_ON(!pages[1]); - - __zs_unmap_object(area, pages, off, class->size); - } - put_cpu_var(zs_map_area); -} -EXPORT_SYMBOL_GPL(zs_unmap_object); - -u64 zs_get_total_size_bytes(struct zs_pool *pool) -{ - int i; - u64 npages = 0; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; - - return npages << PAGE_SHIFT; -} -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); - -module_init(zs_init); -module_exit(zs_exit); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_AUTHOR("Nitin Gupta "); diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/drivers/staging/zsmalloc/zsmalloc.h deleted file mode 100644 index fbe6bec42..000000000 --- a/drivers/staging/zsmalloc/zsmalloc.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. 
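When a mapped object straddles two non-contiguous pages and page-table mapping is not compiled in, the __zs_map_object()/__zs_unmap_object() pair above copies the two pieces through a per-CPU bounce buffer. The userspace sketch below mimics only that split-copy arithmetic with two ordinary buffers; it is an illustration of the idea, not the kernel code, and assumes 4 KiB pages.

/* Sketch: copy an object that spans a page boundary via a bounce buffer. */
#include <assert.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096  /* assumption: 4 KiB pages */

/* "Map": gather the object into buf; off/size locate it across page0/page1. */
static void map_copy_in(char *buf, const char *page0, const char *page1,
                        int off, int size)
{
        int first = PAGE_SIZE - off;    /* bytes living in the first page */

        memcpy(buf, page0 + off, first);
        memcpy(buf + first, page1, size - first);
}

/* "Unmap": scatter the (possibly modified) object back to the two pages. */
static void unmap_copy_out(const char *buf, char *page0, char *page1,
                           int off, int size)
{
        int first = PAGE_SIZE - off;

        memcpy(page0 + off, buf, first);
        memcpy(page1, buf + first, size - first);
}

int main(void)
{
        static char page0[PAGE_SIZE], page1[PAGE_SIZE], buf[512];
        int off = PAGE_SIZE - 100, size = 300;  /* 100 bytes + 200 bytes */

        memset(page0 + off, 'A', 100);
        memset(page1, 'B', 200);

        map_copy_in(buf, page0, page1, off, size);
        assert(buf[0] == 'A' && buf[99] == 'A' && buf[100] == 'B');

        buf[0] = 'Z';                           /* modify while "mapped" */
        unmap_copy_out(buf, page0, page1, off, size);
        assert(page0[off] == 'Z');

        printf("object of %d bytes split as %d + %d across the page boundary\n",
               size, PAGE_SIZE - off, size - (PAGE_SIZE - off));
        return 0;
}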
- * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _ZS_MALLOC_H_ -#define _ZS_MALLOC_H_ - -#include - -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ -}; - -struct zs_pool; - -struct zs_pool *zs_create_pool(gfp_t flags); -void zs_destroy_pool(struct zs_pool *pool); - -unsigned long zs_malloc(struct zs_pool *pool, size_t size); -void zs_free(struct zs_pool *pool, unsigned long obj); - -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - -u64 zs_get_total_size_bytes(struct zs_pool *pool); - -#endif diff --git a/drivers/staging/zsmalloc/zsmalloc_int.h b/drivers/staging/zsmalloc/zsmalloc_int.h deleted file mode 100644 index 92eefc663..000000000 --- a/drivers/staging/zsmalloc/zsmalloc_int.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * zsmalloc memory allocator - * - * Copyright (C) 2011 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the license that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _ZS_MALLOC_INT_H_ -#define _ZS_MALLOC_INT_H_ - -#include -#include -#include - -/* - * This must be power of 2 and greater than of equal to sizeof(link_free). - * These two conditions ensure that any 'struct link_free' itself doesn't - * span more than 1 page which avoids complex case of mapping 2 pages simply - * to restore link_free pointer values. - */ -#define ZS_ALIGN 8 - -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - -/* - * Object location (, ) is encoded as - * as single (void *) handle value. - * - * Note that object index is relative to system - * page it is stored in, so for each sub-page belonging - * to a zspage, obj_idx starts with 0. - * - * This is made more complicated by various memory models and PAE. - */ - -#ifndef MAX_PHYSMEM_BITS -#ifdef CONFIG_HIGHMEM64G -#define MAX_PHYSMEM_BITS 36 -#else /* !CONFIG_HIGHMEM64G */ -/* - * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just - * be PAGE_SHIFT - */ -#define MAX_PHYSMEM_BITS BITS_PER_LONG -#endif -#endif -#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) -#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) - -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) -/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ -#define ZS_MIN_ALLOC_SIZE \ - MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) -#define ZS_MAX_ALLOC_SIZE PAGE_SIZE - -/* - * On systems with 4K page size, this gives 254 size classes! There is a - * trader-off here: - * - Large number of size classes is potentially wasteful as free page are - * spread across these classes - * - Small number of size classes causes large internal fragmentation - * - Probably its better to use specific size classes (empirically - * determined). 
NOTE: all those class sizes must be set as multiple of - * ZS_ALIGN to make sure link_free itself never has to span 2 pages. - * - * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN - * (reason above) - */ -#define ZS_SIZE_CLASS_DELTA 16 -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) - -/* - * We do not maintain any list for completely empty or full pages - */ -enum fullness_group { - ZS_ALMOST_FULL, - ZS_ALMOST_EMPTY, - _ZS_NR_FULLNESS_GROUPS, - - ZS_EMPTY, - ZS_FULL -}; - -/* - * We assign a page to ZS_ALMOST_EMPTY fullness group when: - * n <= N / f, where - * n = number of allocated objects - * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac - * - * Similarly, we assign zspage to: - * ZS_ALMOST_FULL when n > N / f - * ZS_EMPTY when n == 0 - * ZS_FULL when n == N - * - * (see: fix_fullness_group()) - */ -static const int fullness_threshold_frac = 4; - -struct mapping_area { - struct vm_struct *vm; - pte_t *vm_ptes[2]; - char *vm_addr; -}; - -struct size_class { - /* - * Size of objects stored in this class. Must be multiple - * of ZS_ALIGN. - */ - int size; - unsigned int index; - - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int zspage_order; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; -}; - -/* - * Placed within free objects to form a singly linked list. - * For every zspage, first_page->freelist gives head of this list. - * - * This must be power of 2 and less than or equal to ZS_ALIGN - */ -struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; -}; - -struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; - - gfp_t flags; /* allocation flags used when growing pool */ - const char *name; -}; - -#endif diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 92f9fcdc5..54afc7f76 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -148,6 +148,9 @@ void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); +#define cpu_notifier_register_begin cpu_maps_update_begin +#define cpu_notifier_register_done cpu_maps_update_done + #else /* CONFIG_SMP */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) diff --git a/include/linux/swap.h b/include/linux/swap.h index 9781ed09b..00bdb7df3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -339,7 +339,8 @@ extern int __swap_writepage(struct page *page, struct writeback_control *wbc, extern void end_swap_bio_read(struct bio *bio, int err); /* linux/mm/swap_state.c */ -extern struct address_space swapper_space; +extern struct address_space swapper_spaces[]; +#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) #define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); diff --git a/include/linux/zbud.h b/include/linux/zbud.h new file mode 100644 index 000000000..e183a0a65 --- /dev/null +++ b/include/linux/zbud.h @@ -0,0 +1,22 @@ +#ifndef _ZBUD_H_ +#define _ZBUD_H_ + +#include + +struct zbud_pool; + +struct zbud_ops { + int (*evict)(struct zbud_pool *pool, unsigned long handle); +}; + +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops); +void zbud_destroy_pool(struct zbud_pool *pool); +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle); +void zbud_free(struct zbud_pool *pool, 
unsigned long handle); +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); +void *zbud_map(struct zbud_pool *pool, unsigned long handle); +void zbud_unmap(struct zbud_pool *pool, unsigned long handle); +u64 zbud_get_pool_size(struct zbud_pool *pool); + +#endif /* _ZBUD_H_ */ diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 000000000..24e0173c7 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,110 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. + */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +bool zpool_has_pool(char *type); + +struct zpool *zpool_create_pool(const char *type, const char *name, + gfp_t gfp, const struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. 
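The new include/linux/zpool.h above gives zswap-style consumers one frontend over zbud and zsmalloc. The following is a minimal kernel-style sketch of how a hypothetical consumer module might use the declarations in that header; it is illustrative only and not part of the patch.

/* Sketch: a hypothetical zpool consumer, built against the header above. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zpool.h>

static struct zpool *pool;

static int __init zpool_demo_init(void)
{
        unsigned long handle;
        char *dst;
        int ret;

        /* Ask for a zbud-backed pool; no eviction callback for this demo. */
        pool = zpool_create_pool("zbud", "zpool_demo", GFP_KERNEL, NULL);
        if (!pool)
                return -ENOMEM;

        ret = zpool_malloc(pool, 100, GFP_KERNEL, &handle);
        if (ret) {
                zpool_destroy_pool(pool);
                return ret;
        }

        /* Handles are opaque: map before touching the memory, unmap after. */
        dst = zpool_map_handle(pool, handle, ZPOOL_MM_WO);
        memset(dst, 0, 100);
        zpool_unmap_handle(pool, handle);

        pr_info("zpool_demo: %s pool uses %llu bytes\n",
                zpool_get_type(pool), zpool_get_total_size(pool));

        zpool_free(pool, handle);
        return 0;
}

static void __exit zpool_demo_exit(void)
{
        zpool_destroy_pool(pool);
}

module_init(zpool_demo_init);
module_exit(zpool_demo_exit);
MODULE_LICENSE("GPL");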
+ */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(const char *name, + gfp_t gfp, + const struct zpool_ops *ops, + struct zpool *zpool); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +#endif diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 398dae365..57a8e98f2 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. @@ -14,14 +15,13 @@ #define _ZS_MALLOC_H_ #include -#include /* * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. -*/ + * They also have no effect when PGTABLE_MAPPING is selected. + */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ ZS_MM_RO, /* read-only (no copy-out at unmap time) */ @@ -34,14 +34,14 @@ enum zs_mapmode { */ }; -struct zs_ops { - struct page * (*alloc)(gfp_t); - void (*free)(struct page *); +struct zs_pool_stats { + /* How many pages were migrated (freed) */ + unsigned long pages_compacted; }; struct zs_pool; -struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); @@ -51,6 +51,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); -u64 zs_get_total_size_bytes(struct zs_pool *pool); +unsigned long zs_get_total_pages(struct zs_pool *pool); +unsigned long zs_compact(struct zs_pool *pool); +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/mm/Kconfig b/mm/Kconfig index e0b4f6fbb..a34fbc381 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -482,6 +482,24 @@ config ZSWAP_ENABLE_WRITEBACK depends on ZSWAP default n +config ZPOOL + tristate "Common API for compressed memory storage" + default n + help + Compressed memory storage API. This allows using either zbud or + zsmalloc. + +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. 
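The reworked include/linux/zsmalloc.h hunk above drops the old zs_ops/gfp-based constructor in favour of a named pool, with gfp flags passed per allocation instead. A minimal kernel-style sketch against those new declarations follows; it is a hypothetical module for illustration, not part of the patch.

/* Sketch: the reworked zsmalloc API, per the header change above. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

static int __init zs_demo_init(void)
{
        struct zs_pool *pool;
        unsigned long handle;
        void *obj;

        pool = zs_create_pool("zs_demo");
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, 100, GFP_KERNEL);
        if (!handle) {
                zs_destroy_pool(pool);
                return -ENOMEM;
        }

        /* zs_malloc() returns an opaque handle; map it to get a pointer. */
        obj = zs_map_object(pool, handle, ZS_MM_WO);
        memset(obj, 0, 100);
        zs_unmap_object(pool, handle);

        pr_info("zs_demo: pool holds %lu page(s)\n", zs_get_total_pages(pool));

        zs_free(pool, handle);
        zs_destroy_pool(pool);
        return 0;
}

module_init(zs_demo_init);
MODULE_LICENSE("GPL");

The same header also exposes zs_compact() and zs_pool_stats() so a user can trigger compaction and read back how many pages it recovered.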
+ + config DIRECT_RECLAIM_FILE_PAGES_ONLY bool "Reclaim file pages only on direct reclaim path" depends on ZSWAP diff --git a/mm/Makefile b/mm/Makefile index 0774dd357..e5904b71d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -54,3 +54,5 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_ZSMALLOC_NEW) += zsmalloc.o +obj-$(CONFIG_ZPOOL) += zpool.o +obj-$(CONFIG_ZBUD) += zbud.o \ No newline at end of file diff --git a/mm/swap_state.c b/mm/swap_state.c index 77c2332b9..51fc0cd2f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -35,12 +35,12 @@ static struct backing_dev_info swap_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, }; -struct address_space swapper_space = { - .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), - .a_ops = &swap_aops, - .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), - .backing_dev_info = &swap_backing_dev_info, +struct address_space swapper_spaces[MAX_SWAPFILES] = { + [0 ... MAX_SWAPFILES - 1] = { + .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), + .a_ops = &swap_aops, + .backing_dev_info = &swap_backing_dev_info, + } }; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000..894108348 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,640 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is an special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justified" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. 
The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the lists each zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + const struct zbud_ops *ops; +#ifdef CONFIG_ZPOOL + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +#endif +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); +#endif /* CONFIG_ZPOOL */ + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. 
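With NCHUNKS_ORDER = 6 and 4 KiB pages, a zbud page is carved into 64-byte chunks, the first of which is taken by the zbud_header, leaving NCHUNKS = 63 usable chunks. size_to_chunks() above is a plain round-up; the standalone sketch below replays that arithmetic (illustration only, 4 KiB pages assumed):

/* Sketch: zbud chunk geometry for 4 KiB pages and NCHUNKS_ORDER == 6. */
#include <stdio.h>

#define PAGE_SHIFT              12      /* assumption: 4 KiB pages */
#define PAGE_SIZE               (1UL << PAGE_SHIFT)
#define NCHUNKS_ORDER           6
#define CHUNK_SHIFT             (PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE              (1UL << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED       CHUNK_SIZE
#define NCHUNKS                 ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

static unsigned long size_to_chunks(unsigned long size)
{
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

int main(void)
{
        unsigned long first = size_to_chunks(700);      /* e.g. one compressed page */
        unsigned long last = size_to_chunks(2000);      /* its buddy */

        printf("CHUNK_SIZE = %lu, NCHUNKS = %lu\n", CHUNK_SIZE, NCHUNKS);
        printf("700 bytes  -> %lu chunks\n", first);
        printf("2000 bytes -> %lu chunks\n", last);
        printf("free chunks left in that zbud page: %lu\n",
               NCHUNKS - first - last);
        return 0;
}

So a 700-byte and a 2000-byte zpage buddied together occupy 11 + 32 chunks, and the zbud page would sit on the unbuddied-style accounting with 20 chunks of slack between the two until one buddy is freed.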
+ */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kzalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + int found = 0; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page. 
*/ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + found = 1; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + if ((gfp & __GFP_ZERO) && found) + memset((void *)*handle, 0, size); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below). + */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retires: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. 
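+ *
+ * A minimal eviction handler sketch (my_evict and my_writeback are
+ * illustrative names only, not part of this API):
+ *
+ *	static int my_evict(struct zbud_pool *pool, unsigned long handle)
+ *	{
+ *		int err = my_writeback(pool, handle);
+ *
+ *		if (err)
+ *			return err;
+ *		zbud_free(pool, handle);
+ *		return 0;
+ *	}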
+ * + * If the handle can not be evicted, the eviction handler should return + * non-zero. zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the setting of the PG_reclaim flag on the underlying page. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, -EAGAIN if + * the retry limit was hit. + */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for others allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. 
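+ *
+ * A minimal usage sketch (buf and len are illustrative only):
+ *
+ *	void *data = zbud_map(pool, handle);
+ *
+ *	memcpy(buf, data, len);
+ *	zbud_unmap(pool, handle);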
+ * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + + return 0; +} + +static void __exit exit_zbud(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000..308395ba8 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,359 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + const struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/* this assumes @type is null-terminated. 
*/ +static struct zpool_driver *zpool_get_driver(const char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_has_pool() - Check if the pool driver is available + * @type The type of the zpool to check (e.g. zbud, zsmalloc) + * + * This checks if the @type pool driver is available. This will try to load + * the requested module, if needed, but there is no guarantee the module will + * still be loaded and available immediately after calling. If this returns + * true, the caller should assume the pool is available, but must be prepared + * to handle the @zpool_create_pool() returning failure. However if this + * returns false, the caller should assume the requested pool type is not + * available; either the requested pool type module does not exist, or could + * not be loaded, and calling @zpool_create_pool() with the pool type will + * fail. + * + * Returns: true if @type pool is available, false if not + */ +bool zpool_has_pool(char *type) +{ + struct zpool_driver *driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) + return false; + + zpool_put_driver(driver); + return true; +} +EXPORT_SYMBOL(zpool_has_pool); + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, + const struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_debug("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_debug("created pool type %s\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. 
The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_debug("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. 
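+ *
+ * A minimal read-side sketch (dst and len are illustrative only):
+ *
+ *	void *src = zpool_map_handle(zpool, handle, ZPOOL_MM_RO);
+ *
+ *	memcpy(dst, src, len);
+ *	zpool_unmap_handle(zpool, handle);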
+ * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index eff0ca018..8cb22699e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. @@ -10,43 +11,12 @@ * Released under the terms of GNU General Public License Version 2.0 */ - /* - * This allocator is designed for use with zcache and zram. Thus, the - * allocator is supposed to work well under low memory conditions. In - * particular, it never attempts higher order page allocation which is - * very likely to fail under memory pressure. On the other hand, if we - * just use single (0-order) pages, it would suffer from very high - * fragmentation -- any object of size PAGE_SIZE/2 or larger would occupy - * an entire page. This was one of the major issues with its predecessor - * (xvmalloc). - * - * To overcome these issues, zsmalloc allocates a bunch of 0-order pages - * and links them together using various 'struct page' fields. These linked - * pages act as a single higher-order page i.e. an object can span 0-order - * page boundaries. The code refers to these linked pages as a single entity - * called zspage. - * - * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE - * since this satisfies the requirements of all its current users (in the - * worst case, page is incompressible and is thus stored "as-is" i.e. in - * uncompressed form). For allocation requests larger than this size, failure - * is returned (see zs_malloc). - * - * Additionally, zs_malloc() does not return a dereferenceable pointer. - * Instead, it returns an opaque handle (unsigned long) which encodes actual - * location of the allocated object. 
The reason for this indirection is that - * zsmalloc does not keep zspages permanently mapped since that would cause - * issues on 32-bit systems where the VA region for kernel space mappings - * is very small. So, before using the allocating memory, the object has to - * be mapped using zs_map_object() to get a usable pointer and subsequently - * unmapped using zs_unmap_object(). - * * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -56,14 +26,18 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page + * If the page is first_page for huge object, it stores handle. + * Look at size_class->huge. * page->freelist: points to the first free object in zspage. * Free objects are linked together using in-place * metadata. + * page->objects: maximum number of objects we can store in this + * zspage (class->zspage_order * PAGE_SIZE / class->size) * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -71,12 +45,14 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include +#include #include #include #include -#include #include #include #include @@ -84,11 +60,12 @@ #include #include #include -#include +#include #include #include - +#include #include +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -105,6 +82,8 @@ #define ZS_MAX_ZSPAGE_ORDER 2 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + /* * Object location (, ) is encoded as * as single (unsigned long) handle value. @@ -128,17 +107,37 @@ #endif #endif #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) + +/* + * Memory for allocating for handle keeps object position by + * encoding and the encoded value has a room + * in least bit(ie, look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bit at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define MAX(a, b) ((a) >= (b) ? (a) : (b)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +/* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! 
There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes @@ -151,8 +150,6 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) -#define ZS_SIZE_CLASSES ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / \ - ZS_SIZE_CLASS_DELTA + 1) /* * We do not maintain any list for completely empty or full pages @@ -166,12 +163,38 @@ enum fullness_group { ZS_FULL }; +enum zs_stat_type { + OBJ_ALLOCATED, + OBJ_USED, + CLASS_ALMOST_FULL, + CLASS_ALMOST_EMPTY, +}; + +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + +struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; +}; + +#ifdef CONFIG_ZSMALLOC_STAT +static struct dentry *zs_stat_root; +#endif + +/* + * number of size_classes + */ +static int zs_size_classes; + /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where * n = number of allocated objects * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac + * f = fullness_threshold_frac * * Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f @@ -183,6 +206,8 @@ enum fullness_group { static const int fullness_threshold_frac = 4; struct size_class { + spinlock_t lock; + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. @@ -190,15 +215,12 @@ struct size_class { int size; unsigned int index; + struct zs_size_stat stats; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - - spinlock_t lock; - - /* stats */ - u64 pages_allocated; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; }; /* @@ -208,14 +230,39 @@ struct size_class { * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { - /* Handle of next free chunk (encodes ) */ - void *next; + union { + /* + * Position of next free chunk (encodes ) + * It's valid for non-allocated object + */ + void *next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; }; struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; + const char *name; + + struct size_class **size_class; + struct kmem_cache *handle_cachep; + + atomic_long_t pages_allocated; - struct zs_ops *ops; + struct zs_pool_stats stats; + + /* Compact classes */ + struct shrinker shrinker; + /* + * To signify that register_shrinker() was successful + * and unregister_shrinker() will not Oops. + */ + bool shrinker_enabled; +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif }; /* @@ -237,22 +284,128 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; -/* default page alloc/free ops */ -struct page *zs_alloc_page(gfp_t flags) +static int create_handle_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + return pool->handle_cachep ? 
0 : 1; +} + +static void destroy_handle_cache(struct zs_pool *pool) +{ + kmem_cache_destroy(pool->handle_cachep); +} + +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + gfp & ~__GFP_HIGHMEM); +} + +static void free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static void record_obj(unsigned long handle, unsigned long obj) +{ + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); +} + +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size, gfp); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) { - return alloc_page(flags); + zs_unmap_object(pool, handle); } -void zs_free_page(struct page *page) +static u64 zs_zpool_total_size(void *pool) { - __free_page(page); + return zs_get_total_pages(pool) << PAGE_SHIFT; } -struct zs_ops zs_default_ops = { - .alloc = zs_alloc_page, - .free = zs_free_page +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, }; +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ + +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -266,26 +419,28 @@ static int is_last_page(struct page *page) return PagePrivate2(page); } -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, +static void get_zspage_mapping(struct page *first_page, + unsigned int *class_idx, enum fullness_group *fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - m = (unsigned long)page->mapping; + m = (unsigned long)first_page->mapping; *fullness = m & FULLNESS_MASK; *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; } -static void set_zspage_mapping(struct page *page, unsigned int class_idx, +static void set_zspage_mapping(struct page *first_page, + unsigned int 
class_idx, enum fullness_group fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; + first_page->mapping = (struct address_space *)m; } /* @@ -303,8 +458,171 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return idx; + return min(zs_size_classes - 1, idx); +} + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; +} + +#ifdef CONFIG_ZSMALLOC_STAT + +static void __init zs_stat_init(void) +{ + if (!debugfs_initialized()) { + pr_warn("debugfs not available, stat dir not created\n"); + return; + } + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + pr_warn("debugfs 'zsmalloc' stat dir creation failed\n"); +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static unsigned long zs_can_compact(struct size_class *class); + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used, freeable; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage", "freeable"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage, freeable); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + total_freeable += freeable; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages, "", total_freeable); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, 
inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ + struct dentry *entry; + + if (!zs_stat_root) { + pr_warn("no root stat dir, not creating <%s> stat dir\n", name); + return; + } + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "classes"); + debugfs_remove_recursive(pool->stat_dentry); + pool->stat_dentry = NULL; + } +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ +static void __init zs_stat_init(void) +{ +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ } +#endif /* * For each size class, zspages are divided into different groups @@ -313,21 +631,20 @@ static int get_size_class_index(int size) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *page, - struct size_class *class) +static enum fullness_group get_fullness_group(struct page *first_page) { int inuse, max_objects; enum fullness_group fg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - inuse = page->inuse; - max_objects = class->pages_per_zspage * PAGE_SIZE / class->size; + inuse = first_page->inuse; + max_objects = first_page->objects; if (inuse == 0) fg = ZS_EMPTY; else if (inuse == max_objects) fg = ZS_FULL; - else if (inuse <= max_objects / fullness_threshold_frac) + else if (inuse <= 3 * max_objects / fullness_threshold_frac) fg = ZS_ALMOST_EMPTY; else fg = ZS_ALMOST_FULL; @@ -341,33 +658,46 @@ static enum fullness_group get_fullness_group(struct page *page, * have. This functions inserts the given zspage into the freelist * identified by . */ -static void insert_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void insert_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; + zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); + if (!*head) { + *head = first_page; + return; + } - *head = page; + /* + * We want to see more ZS_FULL pages and less almost + * empty/full. Put pages with higher ->inuse first. + */ + list_add_tail(&first_page->lru, &(*head)->lru); + if (first_page->inuse >= (*head)->inuse) + *head = first_page; } /* * This function removes the given zspage from the freelist identified * by . 
*/ -static void remove_zspage(struct page *page, struct size_class *class, - enum fullness_group fullness) +static void remove_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -376,11 +706,13 @@ static void remove_zspage(struct page *page, struct size_class *class, BUG_ON(!*head); if (list_empty(&(*head)->lru)) *head = NULL; - else if (*head == page) + else if (*head == first_page) *head = (struct page *)list_entry((*head)->lru.next, struct page, lru); - list_del_init(&page->lru); + list_del_init(&first_page->lru); + zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? + CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } /* @@ -392,24 +724,22 @@ static void remove_zspage(struct page *page, struct size_class *class, * page from the freelist of the old fullness group to that of the new * fullness group. */ -static enum fullness_group fix_fullness_group(struct zs_pool *pool, - struct page *page) +static enum fullness_group fix_fullness_group(struct size_class *class, + struct page *first_page) { int class_idx; - struct size_class *class; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - get_zspage_mapping(page, &class_idx, &currfg); - class = &pool->size_class[class_idx]; - newfg = get_fullness_group(page, class); + get_zspage_mapping(first_page, &class_idx, &currfg); + newfg = get_fullness_group(first_page); if (newfg == currfg) goto out; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); + remove_zspage(class, currfg, first_page); + insert_zspage(class, newfg, first_page); + set_zspage_mapping(first_page, class_idx, newfg); out: return newfg; @@ -420,7 +750,8 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, * to form a zspage for each size class. This is important * to reduce wastage due to unusable space left at end of * each zspage which is given as: - * wastage = Zp - Zp % size_class + * wastage = Zp % class_size + * usage = Zp - wastage * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... * * For example, for size class of 3/8 * PAGE_SIZE, we should @@ -460,7 +791,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -470,35 +801,59 @@ static struct page *get_next_page(struct page *page) if (is_last_page(page)) next = NULL; else if (is_first_page(page)) - next = (struct page *)page->private; + next = (struct page *)page_private(page); else next = list_entry(page->lru.next, struct page, lru); return next; } -/* Encode as a single handle value */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +/* + * Encode as a single handle value. + * We use the least bit of handle for tagging. 
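+ *
+ * Concretely, the encoding below is:
+ *	obj = ((page_to_pfn(page) << OBJ_INDEX_BITS) | obj_idx) << OBJ_TAG_BITS
+ * which leaves the least significant OBJ_TAG_BITS clear for that tag.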
+ */ +static void *location_to_obj(struct page *page, unsigned long obj_idx) { - unsigned long handle; + unsigned long obj; if (!page) { BUG_ON(obj_idx); return NULL; } - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= (obj_idx & OBJ_INDEX_MASK); + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; - return (void *)handle; + return (void *)obj; } -/* Decode pair from the given object handle */ -static void obj_handle_to_location(unsigned long handle, struct page **page, +/* + * Decode pair from the given object handle. We adjust the + * decoded obj_idx back to its original value since it was adjusted in + * location_to_obj(). + */ +static void obj_to_location(unsigned long obj, struct page **page, unsigned long *obj_idx) { - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = handle & OBJ_INDEX_MASK; + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); +} + +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + +static unsigned long obj_to_head(struct size_class *class, struct page *page, + void *obj) +{ + if (class->huge) { + VM_BUG_ON(!is_first_page(page)); + return page_private(page); + } else + return *(unsigned long *)obj; } static unsigned long obj_idx_to_offset(struct page *page, @@ -512,6 +867,25 @@ static unsigned long obj_idx_to_offset(struct page *page, return off + obj_idx * class_size; } +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + static void reset_page(struct page *page) { clear_bit(PG_private, &page->flags); @@ -519,10 +893,10 @@ static void reset_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; page->freelist = NULL; - reset_page_mapcount(page); + page_mapcount_reset(page); } -static void free_zspage(struct zs_ops *ops, struct page *first_page) +static void free_zspage(struct page *first_page) { struct page *nextp, *tmp, *head_extra; @@ -532,7 +906,7 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page) head_extra = (struct page *)page_private(first_page); reset_page(first_page); - ops->free(first_page); + __free_page(first_page); /* zspage with only 1 system page */ if (!head_extra) @@ -541,14 +915,14 @@ static void free_zspage(struct zs_ops *ops, struct page *first_page) list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { list_del(&nextp->lru); reset_page(nextp); - ops->free(nextp); + __free_page(nextp); } reset_page(head_extra); - ops->free(head_extra); + __free_page(head_extra); } /* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) +static void init_zspage(struct size_class *class, struct page *first_page) { unsigned long off = 0; struct page *page = first_page; @@ -557,7 +931,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) while (page) { struct page *next_page; struct link_free *link; - unsigned int i, objs_on_page; + unsigned int i = 1; + void *vaddr; /* * page->index stores offset of first object starting @@ -568,16 +943,12 @@ static void init_zspage(struct page *first_page, struct size_class *class) if (page 
!= first_page) page->index = off; - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } + while ((off += class->size) < PAGE_SIZE) { + link->next = location_to_obj(page, i++); + link += class->size / sizeof(*link); } /* @@ -586,18 +957,17 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); + link->next = location_to_obj(next_page, 0); + kunmap_atomic(vaddr); page = next_page; - off = (off + class->size) % PAGE_SIZE; + off %= PAGE_SIZE; } } /* * Allocate a zspage for the given size class */ -static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, - gfp_t flags) +static struct page *alloc_zspage(struct size_class *class, gfp_t flags) { int i, error; struct page *first_page = NULL, *uninitialized_var(prev_page); @@ -606,7 +976,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. Also, we set PG_private to identify the first page @@ -617,7 +987,7 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, for (i = 0; i < class->pages_per_zspage; i++) { struct page *page; - page = ops->alloc(flags); + page = alloc_page(flags); if (!page) goto cleanup; @@ -629,9 +999,9 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, first_page->inuse = 0; } if (i == 1) - first_page->private = (unsigned long)page; + set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -639,15 +1009,17 @@ static struct page *alloc_zspage(struct zs_ops *ops, struct size_class *class, prev_page = page; } - init_zspage(first_page, class); + init_zspage(class, first_page); - first_page->freelist = obj_location_to_handle(first_page, 0); + first_page->freelist = location_to_obj(first_page, 0); + /* Maximum number of objects we can store in this zspage */ + first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; error = 0; /* Success */ cleanup: if (unlikely(error) && first_page) { - free_zspage(ops, first_page); + free_zspage(first_page); first_page = NULL; } @@ -693,7 +1065,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -702,14 +1074,11 @@ static inline void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { unsigned long addr = 
(unsigned long)area->vm_addr; - unsigned long end = addr + (PAGE_SIZE * 2); - flush_cache_vunmap(addr, end); - unmap_kernel_range_noflush(addr, PAGE_SIZE * 2); - flush_tlb_kernel_range(addr, end); + unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING*/ +#else /* CONFIG_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -719,7 +1088,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) */ if (area->vm_buf) return 0; - area->vm_buf = (char *)__get_free_page(GFP_KERNEL); + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; @@ -727,8 +1096,7 @@ static inline int __zs_cpu_up(struct mapping_area *area) static inline void __zs_cpu_down(struct mapping_area *area) { - if (area->vm_buf) - free_page((unsigned long)area->vm_buf); + kfree(area->vm_buf); area->vm_buf = NULL; } @@ -765,12 +1133,17 @@ static void __zs_unmap_object(struct mapping_area *area, { int sizes[2]; void *addr; - char *buf = area->vm_buf; + char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; + buf = area->vm_buf; + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -816,197 +1189,71 @@ static struct notifier_block zs_cpu_nb = { .notifier_call = zs_cpu_notifier }; -static void zs_exit(void) +static int zs_register_cpu_notifier(void) { - int cpu; - - for_each_online_cpu(cpu) - zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); -} + int cpu, uninitialized_var(ret); -static int zs_init(void) -{ - int cpu, ret; + cpu_notifier_register_begin(); - register_cpu_notifier(&zs_cpu_nb); + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); if (notifier_to_errno(ret)) - goto fail; + break; } - return 0; -fail: - zs_exit(); + + cpu_notifier_register_done(); return notifier_to_errno(ret); } -/** - * zs_create_pool - Creates an allocation pool to work from. - * @flags: allocation flags used to allocate pool metadata - * @ops: allocation/free callbacks for expanding the pool - * - * This function must be called before anything when using - * the zsmalloc allocator. - * - * On success, a pointer to the newly created pool is returned, - * otherwise NULL. 
- */ -struct zs_pool *zs_create_pool(gfp_t flags, struct zs_ops *ops) +static void zs_unregister_cpu_notifier(void) { - int i, ovhd_size; - struct zs_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, flags); - if (!pool) - return NULL; + int cpu; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int size; - struct size_class *class; + cpu_notifier_register_begin(); - size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; - if (size > ZS_MAX_ALLOC_SIZE) - size = ZS_MAX_ALLOC_SIZE; + for_each_online_cpu(cpu) + zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); + __unregister_cpu_notifier(&zs_cpu_nb); - class = &pool->size_class[i]; - class->size = size; - class->index = i; - spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); + cpu_notifier_register_done(); +} - } +static void init_zs_size_classes(void) +{ + int nr; - if (ops) - pool->ops = ops; - else - pool->ops = &zs_default_ops; + nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; + if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) + nr += 1; - return pool; + zs_size_classes = nr; } -EXPORT_SYMBOL_GPL(zs_create_pool); -void zs_destroy_pool(struct zs_pool *pool) +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) { - int i; + if (prev->pages_per_zspage != pages_per_zspage) + return false; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { - int fg; - struct size_class *class = &pool->size_class[i]; + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; - for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { - if (class->fullness_list[fg]) { - pr_info("Freeing non-empty class with size " - "%db, fullness group %d\n", - class->size, fg); - } - } - } - kfree(pool); + return true; } -EXPORT_SYMBOL_GPL(zs_destroy_pool); -/** - * zs_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * - * On success, handle to the allocated object is returned, - * otherwise 0. - * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
- */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags) +static bool zspage_full(struct page *first_page) { - unsigned long obj; - struct link_free *link; - int class_idx; - struct size_class *class; - - struct page *first_page, *m_page; - unsigned long m_objidx, m_offset; - - if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; - - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); - - spin_lock(&class->lock); - first_page = find_get_zspage(class); - - if (!first_page) { - spin_unlock(&class->lock); - first_page = alloc_zspage(pool->ops, class, flags); - if (unlikely(!first_page)) - return 0; - - set_zspage_mapping(first_page, class->index, ZS_EMPTY); - spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; - } - - obj = (unsigned long)first_page->freelist; - obj_handle_to_location(obj, &m_page, &m_objidx); - m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); - first_page->freelist = link->next; - memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); - - first_page->inuse++; - /* Now move the zspage to another fullness group, if required */ - fix_fullness_group(pool, first_page); - spin_unlock(&class->lock); + BUG_ON(!is_first_page(first_page)); - return obj; + return first_page->inuse == first_page->objects; } -EXPORT_SYMBOL_GPL(zs_malloc); -void zs_free(struct zs_pool *pool, unsigned long obj) +unsigned long zs_get_total_pages(struct zs_pool *pool) { - struct link_free *link; - struct page *first_page, *f_page; - unsigned long f_objidx, f_offset; - - int class_idx; - struct size_class *class; - enum fullness_group fullness; - - if (unlikely(!obj)) - return; - - obj_handle_to_location(obj, &f_page, &f_objidx); - first_page = get_first_page(f_page); - - get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; - f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); - - spin_lock(&class->lock); - - /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); - link->next = first_page->freelist; - kunmap_atomic(link); - first_page->freelist = (void *)obj; - - first_page->inuse--; - fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - - spin_unlock(&class->lock); - - if (fullness == ZS_EMPTY) - free_zspage(pool->ops, first_page); + return atomic_long_read(&pool->pages_allocated); } -EXPORT_SYMBOL_GPL(zs_free); +EXPORT_SYMBOL_GPL(zs_get_total_pages); /** * zs_map_object - get address of allocated object from handle. @@ -1021,18 +1268,19 @@ EXPORT_SYMBOL_GPL(zs_free); * against nested mappings. * * This function returns with preemption and page faults disabled. 
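 *
 * A minimal write-side sketch (src and len are illustrative only):
 *
 *	void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);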
-*/ + */ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; + void *ret; BUG_ON(!handle); @@ -1043,9 +1291,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - obj_handle_to_location(handle, &page, &obj_idx); + /* From now on, migration cannot move the object */ + pin_tag(handle); + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = &get_cpu_var(zs_map_area); @@ -1053,7 +1305,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ area->vm_addr = kmap_atomic(page); - return area->vm_addr + off; + ret = area->vm_addr + off; + goto out; } /* this object spans two pages */ @@ -1061,14 +1314,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, pages[1] = get_next_page(page); BUG_ON(!pages[1]); - return __zs_map_object(area, pages, off, class->size); + ret = __zs_map_object(area, pages, off, class->size); +out: + if (!class->huge) + ret += ZS_HANDLE_SIZE; + + return ret; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct page *page; - unsigned long obj_idx, off; + unsigned long obj, obj_idx, off; unsigned int class_idx; enum fullness_group fg; @@ -1077,12 +1335,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) BUG_ON(!handle); - obj_handle_to_location(handle, &page, &obj_idx); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { @@ -1095,20 +1354,704 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); -u64 zs_get_total_size_bytes(struct zs_pool *pool) +static unsigned long obj_malloc(struct size_class *class, + struct page *first_page, unsigned long handle) +{ + unsigned long obj; + struct link_free *link; + + struct page *m_page; + unsigned long m_objidx, m_offset; + void *vaddr; + + handle |= OBJ_ALLOCATED_TAG; + obj = (unsigned long)first_page->freelist; + obj_to_location(obj, &m_page, &m_objidx); + m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + first_page->freelist = link->next; + if (!class->huge) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record handle in first_page->private */ + set_page_private(first_page, handle); + kunmap_atomic(vaddr); + first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); + + return obj; +} + + +/** + * zs_malloc - Allocate block of given size from pool. 
+ * @pool: pool to allocate from + * @size: size of block to allocate + * + * On success, handle to the allocated object is returned, + * otherwise 0. + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. + */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +{ + unsigned long handle, obj; + struct size_class *class; + struct page *first_page; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return 0; + + handle = alloc_handle(pool, gfp); + if (!handle) + return 0; + + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + + spin_lock(&class->lock); + first_page = find_get_zspage(class); + + if (!first_page) { + spin_unlock(&class->lock); + first_page = alloc_zspage(class, gfp); + if (unlikely(!first_page)) { + free_handle(pool, handle); + return 0; + } + + set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + + spin_lock(&class->lock); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + } + + obj = obj_malloc(class, first_page, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, first_page); + record_obj(handle, obj); + spin_unlock(&class->lock); + + return handle; +} +EXPORT_SYMBOL_GPL(zs_malloc); + +static void obj_free(struct size_class *class, unsigned long obj) +{ + struct link_free *link; + struct page *first_page, *f_page; + unsigned long f_objidx, f_offset; + void *vaddr; + + BUG_ON(!obj); + + obj &= ~OBJ_ALLOCATED_TAG; + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); + + vaddr = kmap_atomic(f_page); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); + link->next = first_page->freelist; + if (class->huge) + set_page_private(first_page, 0); + kunmap_atomic(vaddr); + first_page->freelist = (void *)obj; + first_page->inuse--; + zs_stat_dec(class, OBJ_USED, 1); +} + +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct page *first_page, *f_page; + unsigned long obj, f_objidx; + int class_idx; + struct size_class *class; + enum fullness_group fullness; + + if (unlikely(!handle)) + return; + + pin_tag(handle); + obj = handle_to_obj(handle); + obj_to_location(obj, &f_page, &f_objidx); + first_page = get_first_page(f_page); + + get_zspage_mapping(first_page, &class_idx, &fullness); + class = pool->size_class[class_idx]; + + spin_lock(&class->lock); + obj_free(class, obj); + fullness = fix_fullness_group(class, first_page); + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } + spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = 
obj_idx_to_offset(d_page, d_objidx, class->size); + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + if (s_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } + + if (d_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int index) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(class, page, addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. */ + int index; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int ret = 0; + + while (1) { + handle = find_alloced_obj(class, s_page, index); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(class, d_page, handle); + zs_object_copy(class, free_obj, used_obj); + index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. 
+ */ + free_obj |= BIT(HANDLE_PIN_BIT); + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(class, used_obj); + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + + return ret; +} + +static struct page *isolate_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(class, i, page); + break; + } + } + + return page; +} + +/* + * putback_zspage - add @first_page into right class's fullness list + * @pool: target pool + * @class: destination class + * @first_page: target page + * + * Return @fist_page's fullness_group + */ +static enum fullness_group putback_zspage(struct zs_pool *pool, + struct size_class *class, + struct page *first_page) +{ + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + fullness = get_fullness_group(first_page); + insert_zspage(class, fullness, first_page); + set_zspage_mapping(first_page, class->index, fullness); + + if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + + free_zspage(first_page); + } + + return fullness; +} + +static struct page *isolate_source_page(struct size_class *class) { int i; - u64 npages = 0; + struct page *page = NULL; + + for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { + page = class->fullness_list[i]; + if (!page) + continue; - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; + remove_zspage(class, i, page); + break; + } + + return page; +} + +/* + * + * Based on the number of unused allocated objects calculate + * and return the number of pages that we can free. + */ +static unsigned long zs_can_compact(struct size_class *class) +{ + unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); + + if (obj_allocated <= obj_used) + return 0; + + obj_wasted = obj_allocated - obj_used; + obj_wasted /= get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + + return obj_wasted * class->pages_per_zspage; +} + +static void __zs_compact(struct zs_pool *pool, struct size_class *class) +{ + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + if (!zs_can_compact(class)) + break; + + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = isolate_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. 
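To make the zs_can_compact() estimate above concrete, here is the same arithmetic as a standalone sketch: given per-class OBJ_ALLOCATED/OBJ_USED counters, it reports how many whole pages compaction could reclaim. The sample numbers are invented for illustration.

#include <stdio.h>

/* mirrors zs_can_compact(): wasted objects rounded down to whole zspages */
static unsigned long can_compact(unsigned long obj_allocated,
                                 unsigned long obj_used,
                                 int maxobj_per_zspage,
                                 int pages_per_zspage)
{
        unsigned long obj_wasted;

        if (obj_allocated <= obj_used)
                return 0;

        obj_wasted = (obj_allocated - obj_used) / maxobj_per_zspage;
        return obj_wasted * pages_per_zspage;
}

int main(void)
{
        /* e.g. a class whose zspage holds 10 objects across 3 pages */
        printf("%lu pages reclaimable\n", can_compact(100, 64, 10, 3));
        return 0;
}

With 100 objects allocated but only 64 in use, 36 wasted objects amount to three full zspages, so 9 pages are reported as reclaimable.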
+ */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + pool->stats.pages_compacted += class->pages_per_zspage; + spin_unlock(&class->lock); + cond_resched(); + spin_lock(&class->lock); + } + + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + __zs_compact(pool, class); + } + + return pool->stats.pages_compacted; +} +EXPORT_SYMBOL_GPL(zs_compact); + +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) +{ + memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); +} +EXPORT_SYMBOL_GPL(zs_pool_stats); + +static unsigned long zs_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_freed; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + pages_freed = pool->stats.pages_compacted; + /* + * Compact classes and calculate compaction delta. + * Can run concurrently with a manually triggered + * (by user) compaction. + */ + pages_freed = zs_compact(pool) - pages_freed; + + return pages_freed ? pages_freed : SHRINK_STOP; +} + +static unsigned long zs_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int i; + struct size_class *class; + unsigned long pages_to_free = 0; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + + pages_to_free += zs_can_compact(class); + } + + return pages_to_free; +} + +static void zs_unregister_shrinker(struct zs_pool *pool) +{ + if (pool->shrinker_enabled) { + unregister_shrinker(&pool->shrinker); + pool->shrinker_enabled = false; + } +} + +static void zs_register_shrinker(struct zs_pool *pool) +{ + pool->shrinker.scan_objects = zs_shrinker_scan; + pool->shrinker.count_objects = zs_shrinker_count; + pool->shrinker.batch = 0; + pool->shrinker.seeks = DEFAULT_SEEKS; + + register_shrinker(&pool->shrinker); +} + +static int zs_register_shrinker_int(struct zs_pool *pool) +{ + zs_register_shrinker(pool); + return 0; +} + +/** + * zs_create_pool - Creates an allocation pool to work from. + * @flags: allocation flags used to allocate pool metadata + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(const char *name) +{ + int i; + struct zs_pool *pool; + struct size_class *prev_class = NULL; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), + GFP_KERNEL); + if (!pool->size_class) { + kfree(pool); + return NULL; + } + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_handle_cache(pool)) + goto err; + + /* + * Iterate reversly, because, size of size_class that we want to use + * for merging should be larger or equal to current size. 
+ */ + for (i = zs_size_classes - 1; i >= 0; i--) { + int size; + int pages_per_zspage; + struct size_class *class; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we makes size_class point to + * previous size_class if possible. + */ + if (prev_class) { + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; + + class->size = size; + class->index = i; + class->pages_per_zspage = pages_per_zspage; + if (pages_per_zspage == 1 && + get_maxobj_per_zspage(size, pages_per_zspage) == 1) + class->huge = true; + spin_lock_init(&class->lock); + pool->size_class[i] = class; + + prev_class = class; + } + + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); + + /* + * Not critical, we still can use the pool + * and user can trigger compaction manually. + */ + if (zs_register_shrinker_int(pool) == 0) + pool->shrinker_enabled = true; + return pool; + +err: + zs_destroy_pool(pool); + return NULL; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + zs_unregister_shrinker(pool); + zs_pool_stat_destroy(pool); + + for (i = 0; i < zs_size_classes; i++) { + int fg; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + if (class->index != i) + continue; + + for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { + if (class->fullness_list[fg]) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + kfree(class); + } + + destroy_handle_cache(pool); + kfree(pool->size_class); + kfree(pool->name); + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +static int __init zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) + goto notifier_fail; + + init_zs_size_classes(); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + + zs_stat_init(); + + return 0; + +notifier_fail: + zs_unregister_cpu_notifier(); + + return ret; +} + +static void __exit zs_exit(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + zs_unregister_cpu_notifier(); - return npages << PAGE_SHIFT; + zs_stat_exit(); } -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); module_init(zs_init); module_exit(zs_exit); diff --git a/mm/zswap.c b/mm/zswap.c index a00c56482..ffc2c794d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1,11 +1,11 @@ /* * zswap.c - zswap driver file * - * zswap is a backend for frontswap that takes pages that are in the - * process of being swapped out and attempts to compress them and store - * them in a RAM-based memory pool. This results in a significant I/O - * reduction on the real swap device and, in the case of a slow swap - * device, can also improve workload performance. + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. 
This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. * * Copyright (C) 2012 Seth Jennings * @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include @@ -45,15 +45,10 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -atomic_t zswap_pool_pages = ATOMIC_INIT(0); +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ -atomic_t zswap_stored_pages = ATOMIC_INIT(0); - -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK -/* The number of outstanding pages awaiting writeback */ -static atomic_t zswap_outstanding_writebacks = ATOMIC_INIT(0); -#endif +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -61,53 +56,47 @@ static atomic_t zswap_outstanding_writebacks = ATOMIC_INIT(0); * they do provide useful information on roughly how many times a * certain event is occurring. */ + +/* Pool limit was hit (see zswap_max_pool_percent) */ static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; -static u64 zswap_writeback_attempted; -static u64 zswap_reject_tmppage_fail; -static u64 zswap_reject_zsmalloc_fail; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ static u64 zswap_reject_kmemcache_fail; -static u64 zswap_saved_by_writeback; +/* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; /********************************* * tunables **********************************/ -/* Enable/disable zswap (enabled by default, fixed at boot for now) */ -static bool zswap_enabled = 1; -module_param_named(enabled, zswap_enabled, bool, 0); + +/* Enable/disable zswap (disabled by default) */ +static bool zswap_enabled; +module_param_named(enabled, zswap_enabled, bool, 0644); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" -#ifndef CONFIG_CRYPTO_LZ4 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; -#else -static char *zswap_compressor = "lz4"; -#endif -module_param_named(compressor, zswap_compressor, charp, 0); +module_param_named(compressor, zswap_compressor, charp, 0444); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* - * Maximum compression ratio, as as percentage, for an acceptable - * compressed page. Any pages that do not compress by at least - * this ratio will be rejected. 
-*/ -static unsigned int zswap_max_compression_ratio = 80; -module_param_named(max_compression_ratio, - zswap_max_compression_ratio, uint, 0644); +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); -/* - * Maximum number of outstanding writebacks allowed at any given time. - * This is to prevent decompressing an unbounded number of compressed - * pages into the swap cache all at once, and to help with writeback - * congestion. -*/ -#define ZSWAP_MAX_OUTSTANDING_FLUSHES 64 +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -161,17 +150,15 @@ static int __init zswap_comp_init(void) return 0; } -static void zswap_comp_exit(void) +static void __init zswap_comp_exit(void) { /* free percpu transforms */ - if (zswap_comp_pcpu_tfms) - free_percpu(zswap_comp_pcpu_tfms); + free_percpu(zswap_comp_pcpu_tfms); } /********************************* * data structures **********************************/ - /* * struct zswap_entry * @@ -179,41 +166,37 @@ static void zswap_comp_exit(void) * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type - * lru - links the entry into the lru list for the appropriate swap type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. - * type - the swap type for the entry. Used to map back to the zswap_tree - * structure that contains the entry. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. 
Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; - struct list_head lru; - int refcount; pgoff_t offset; - unsigned long handle; + int refcount; unsigned int length; + unsigned long handle; +}; + +struct zswap_header { + swp_entry_t swpentry; }; /* * The tree lock in the zswap_tree struct protects a few things: * - the rbtree - * - the lru list * - the refcount field of each entry in the tree */ struct zswap_tree { struct rb_root rbroot; - struct list_head lru; spinlock_t lock; - struct zs_pool *pool; - unsigned type; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -221,49 +204,35 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; /********************************* * zswap entry functions **********************************/ -#define ZSWAP_KMEM_CACHE_NAME "zswap_entry_cache" static struct kmem_cache *zswap_entry_cache; -static inline int zswap_entry_cache_create(void) +static int __init zswap_entry_cache_create(void) { - zswap_entry_cache = - kmem_cache_create(ZSWAP_KMEM_CACHE_NAME, - sizeof(struct zswap_entry), 0, 0, NULL); - return (zswap_entry_cache == NULL); + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return zswap_entry_cache == NULL; } -static inline void zswap_entry_cache_destory(void) +static void __init zswap_entry_cache_destroy(void) { kmem_cache_destroy(zswap_entry_cache); } -static inline struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) { struct zswap_entry *entry; entry = kmem_cache_alloc(zswap_entry_cache, gfp); if (!entry) return NULL; - INIT_LIST_HEAD(&entry->lru); entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); return entry; } -static inline void zswap_entry_cache_free(struct zswap_entry *entry) +static void zswap_entry_cache_free(struct zswap_entry *entry) { kmem_cache_free(zswap_entry_cache, entry); } -static inline void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -static inline int zswap_entry_put(struct zswap_entry *entry) -{ - entry->refcount--; - return entry->refcount; -} - /********************************* * rbtree functions **********************************/ @@ -285,9 +254,9 @@ static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) } /* - * In the case that a entry with the same offset is found, it a pointer to + * In the case that a entry with the same offset is found, a pointer to * the existing entry is stored in dupentry and the function returns -EEXIST -*/ + */ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, struct zswap_entry **dupentry) { @@ -311,6 +280,60 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing and entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. 
+ */ +static void zswap_free_entry(struct zswap_entry *entry) +{ + zpool_free(zswap_pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody reference the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry = NULL; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + /********************************* * per-cpu code **********************************/ @@ -329,7 +352,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); @@ -346,10 +369,8 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; } dst = per_cpu(zswap_dstmem, cpu); - if (dst) { - kfree(dst); - per_cpu(zswap_dstmem, cpu) = NULL; - } + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; break; default: break; @@ -368,108 +389,42 @@ static struct notifier_block zswap_cpu_notifier_block = { .notifier_call = zswap_cpu_notifier }; -static int zswap_cpu_init(void) +static int __init zswap_cpu_init(void) { unsigned long cpu; - get_online_cpus(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) goto cleanup; - register_cpu_notifier(&zswap_cpu_notifier_block); - put_online_cpus(); + __register_cpu_notifier(&zswap_cpu_notifier_block); + cpu_notifier_register_done(); return 0; cleanup: for_each_online_cpu(cpu) __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); - put_online_cpus(); + cpu_notifier_register_done(); return -ENOMEM; } -/********************************* -* zsmalloc callbacks -**********************************/ -static mempool_t *zswap_page_pool; - -static inline unsigned int zswap_max_pool_pages(void) -{ - return zswap_max_pool_percent * totalram_pages / 100; -} - -static inline int zswap_page_pool_create(void) -{ - /* TODO: dynamically size mempool */ - zswap_page_pool = mempool_create_page_pool(256, 0); - if (!zswap_page_pool) - return -ENOMEM; - return 0; -} - -static inline void zswap_page_pool_destroy(void) -{ - mempool_destroy(zswap_page_pool); -} - -static struct page *zswap_alloc_page(gfp_t flags) -{ - struct page *page; - - if (atomic_read(&zswap_pool_pages) >= zswap_max_pool_pages()) { - zswap_pool_limit_hit++; - return NULL; - } - page = mempool_alloc(zswap_page_pool, flags); - if (page) - atomic_inc(&zswap_pool_pages); - return page; -} - -static void zswap_free_page(struct page *page) -{ - if (!page) - return; - mempool_free(page, zswap_page_pool); - atomic_dec(&zswap_pool_pages); -} - -static struct zs_ops zswap_zs_ops = { - .alloc = zswap_alloc_page, - .free = zswap_free_page -}; - - 
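The zswap_entry_get()/zswap_entry_put() helpers above rely on a plain, non-atomic refcount that is safe only because every caller holds the tree spinlock. A freestanding sketch of that pattern, with a pthread mutex standing in for the spinlock and all names invented for illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int refcount;           /* plain int: protected by tree_lock */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller must hold tree_lock */
static void entry_get(struct entry *e)
{
        e->refcount++;
}

/* caller must hold tree_lock; frees once the last reference is dropped */
static void entry_put(struct entry *e)
{
        if (--e->refcount == 0) {
                /* the kernel also unlinks the entry from the rbtree here */
                free(e);
                printf("entry freed\n");
        }
}

int main(void)
{
        struct entry *e = calloc(1, sizeof(*e));

        e->refcount = 1;                /* initial reference from creation */

        pthread_mutex_lock(&tree_lock);
        entry_get(e);                   /* e.g. a concurrent load takes a ref */
        pthread_mutex_unlock(&tree_lock);

        pthread_mutex_lock(&tree_lock);
        entry_put(e);                   /* load finished */
        entry_put(e);                   /* invalidate drops the initial ref */
        pthread_mutex_unlock(&tree_lock);
        return 0;
}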
/********************************* * helpers **********************************/ - -/* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, - * freeing the entry itself, and decrementing the number of stored pages. - */ -static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) +static bool zswap_is_full(void) { - zs_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK /********************************* * writeback code **********************************/ -static void zswap_end_swap_write(struct bio *bio, int err) -{ - end_swap_bio_write(bio, err); - atomic_dec(&zswap_outstanding_writebacks); - zswap_written_back_pages++; -} - /* return enum for zswap_get_swap_cache_page */ enum zswap_get_swap_ret { ZSWAP_SWAPCACHE_NEW, ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_NOMEM + ZSWAP_SWAPCACHE_FAIL, }; /* @@ -483,15 +438,16 @@ enum zswap_get_swap_ret { * added to the swap cache, and returned in retpage. * * If success, the swap cache page is returned in retpage - * Returns 0 if page was already in the swap cache, page is not locked - * Returns 1 if the new page needs to be populated, page is locked - * Returns <0 on error + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error */ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; @@ -557,13 +513,13 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, if (new_page) page_cache_release(new_page); if (!found_page) - return ZSWAP_SWAPCACHE_NOMEM; + return ZSWAP_SWAPCACHE_FAIL; *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } /* - * Attempts to free and entry by adding a page to the swap cache, + * Attempts to free an entry by adding a page to the swap cache, * decompressing the entry data into the page, and issuing a * bio write to write the page back to the swap device. * @@ -574,12 +530,14 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. 
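For the zswap_is_full() helper above, the check reads as "compressed-pool pages have reached max_pool_percent of RAM", at which point the store path falls back to zpool_shrink() and the writeback code that follows. A small sketch of the same comparison, with all numbers invented:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* mirrors zswap_is_full(); inputs are illustrative, not kernel values */
static int pool_is_full(unsigned long totalram_pages,
                        unsigned int max_pool_percent,
                        unsigned long long pool_total_bytes)
{
        return totalram_pages * max_pool_percent / 100 <
               DIV_ROUND_UP(pool_total_bytes, PAGE_SIZE);
}

int main(void)
{
        /* 2 GiB of RAM, default 20% limit, 500 MiB already compressed */
        unsigned long totalram = (2UL << 30) / PAGE_SIZE;

        printf("full: %d\n", pool_is_full(totalram, 20, 500ULL << 20));
        return 0;
}

Here the 20% budget is about 104857 pages while the pool already holds 128000 compressed pages, so the check reports the pool as full.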
*/ -static int zswap_writeback_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { - unsigned long type = tree->type; - struct page *page; + struct zswap_header *zhdr; swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; u8 *src, *dst; unsigned int dlen; int ret; @@ -587,30 +545,46 @@ static int zswap_writeback_entry(struct zswap_tree *tree, .sync_mode = WB_SYNC_NONE, }; - /* get/allocate page in the swap cache */ - swpentry = swp_entry(type, entry->offset); + /* extract swpentry from data */ + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + swpentry = zhdr->swpentry; /* here */ + zpool_unmap_handle(pool, handle); + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + return 0; + } + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + ret = -ENOMEM; + goto fail; - case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ - return -ENOMEM; - break; /* not reached */ - - case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ - return -EEXIST; - break; /* not reached */ + page_cache_release(page); + ret = -EEXIST; + goto fail; case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, - dst, &dlen); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, + entry->length, dst, &dlen); kunmap_atomic(dst); - zs_unmap_object(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -618,148 +592,45 @@ static int zswap_writeback_entry(struct zswap_tree *tree, SetPageUptodate(page); } - /* start writeback */ + /* move it to the tail of the inactive list after end_writeback */ SetPageReclaim(page); - if (!__swap_writepage(page, &wbc, zswap_end_swap_write)) - atomic_inc(&zswap_outstanding_writebacks); - page_cache_release(page); - - return 0; -} - -/* - * Attempts to free nr of entries via writeback to the swap device. - * The number of entries that were actually freed is returned. - */ -static int zswap_writeback_entries(struct zswap_tree *tree, int nr) -{ - struct zswap_entry *entry; - int i, ret, refcount, freed_nr = 0; - - for (i = 0; i < nr; i++) { - /* - * This limits is arbitrary for now until a better - * policy can be implemented. This is so we don't - * eat all of RAM decompressing pages for writeback. 
- */ - if (atomic_read(&zswap_outstanding_writebacks) > - ZSWAP_MAX_OUTSTANDING_FLUSHES) - break; - - spin_lock(&tree->lock); - - /* dequeue from lru */ - if (list_empty(&tree->lru)) { - spin_unlock(&tree->lock); - break; - } - entry = list_first_entry(&tree->lru, - struct zswap_entry, lru); - list_del_init(&entry->lru); - - /* so invalidate doesn't free the entry from under us */ - zswap_entry_get(entry); - - spin_unlock(&tree->lock); - /* attempt writeback */ - ret = zswap_writeback_entry(tree, entry); - - spin_lock(&tree->lock); - - /* drop reference from above */ - refcount = zswap_entry_put(entry); - - if (!ret) - /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); - - /* - * There are four possible values for refcount here: - * (1) refcount is 2, writeback failed and load is in progress; - * do nothing, load will add us back to the LRU - * (2) refcount is 1, writeback failed; do not free entry, - * add back to LRU - * (3) refcount is 0, (normal case) not invalidate yet; - * remove from rbtree and free entry - * (4) refcount is -1, invalidate happened during writeback; - * free entry - */ - if (refcount == 1) - list_add(&entry->lru, &tree->lru); - - if (refcount == 0) { - /* no invalidate yet, remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); - } - spin_unlock(&tree->lock); - if (refcount <= 0) { - /* free the entry */ - zswap_free_entry(tree, entry); - freed_nr++; - } - } - return freed_nr; -} -#endif /* CONFIG_ZSWAP_ENABLE_WRITEBACK */ - -/******************************************* -* page pool for temporary compression result -********************************************/ -#define ZSWAP_TMPPAGE_POOL_PAGES 16 -static LIST_HEAD(zswap_tmppage_list); -static DEFINE_SPINLOCK(zswap_tmppage_lock); - -static void zswap_tmppage_pool_destroy(void) -{ - struct page *page, *tmppage; + /* start writeback */ + __swap_writepage(page, &wbc, end_swap_bio_write); + page_cache_release(page); + zswap_written_back_pages++; - spin_lock(&zswap_tmppage_lock); - list_for_each_entry_safe(page, tmppage, &zswap_tmppage_list, lru) { - list_del(&page->lru); - __free_pages(page, 1); - } - spin_unlock(&zswap_tmppage_lock); -} + spin_lock(&tree->lock); + /* drop local reference */ + zswap_entry_put(tree, entry); -static int zswap_tmppage_pool_create(void) -{ - int i; - struct page *page; + /* + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); - for (i = 0; i < ZSWAP_TMPPAGE_POOL_PAGES; i++) { - page = alloc_pages(GFP_KERNEL, 1); - if (!page) { - zswap_tmppage_pool_destroy(); - return -ENOMEM; - } - spin_lock(&zswap_tmppage_lock); - list_add(&page->lru, &zswap_tmppage_list); - spin_unlock(&zswap_tmppage_lock); - } - return 0; -} + goto end; -static inline struct page *zswap_tmppage_alloc(void) -{ - struct page *page; - - spin_lock(&zswap_tmppage_lock); - if (list_empty(&zswap_tmppage_list)) { - spin_unlock(&zswap_tmppage_lock); - return NULL; - } - page = list_first_entry(&zswap_tmppage_list, struct page, lru); - list_del(&page->lru); - spin_unlock(&zswap_tmppage_lock); - return page; -} + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may happening concurrently + * it is safe and okay to not free the 
entry + * if we free the entry in the following put + * it it either okay to return !0 + */ +fail: + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); -static inline void zswap_tmppage_free(struct page *page) -{ - spin_lock(&zswap_tmppage_lock); - list_add(&page->lru, &zswap_tmppage_list); - spin_unlock(&zswap_tmppage_lock); +end: + return ret; } /********************************* @@ -772,25 +643,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; int ret; - unsigned int dlen = PAGE_SIZE; + unsigned int dlen = PAGE_SIZE, len; unsigned long handle; char *buf; u8 *src, *dst; - struct page *tmppage; - bool writeback_attempted = 0; -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - u8 *tmpdst; -#endif + struct zswap_header *zhdr; - if (!tree) { + if (!zswap_enabled || !tree) { ret = -ENODEV; goto reject; } - /* if this page got EIO on pageout before, give up immediately */ - if (PageError(page)) { - ret = -ENOMEM; - goto reject; + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + if (zpool_shrink(zswap_pool, 1, NULL)) { + zswap_reject_reclaim_fail++; + ret = -ENOMEM; + goto reject; + } } /* allocate entry */ @@ -810,62 +681,25 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = -EINVAL; goto freepage; } - if ((dlen * 100 / PAGE_SIZE) > zswap_max_compression_ratio) { + + /* store */ + len = dlen + sizeof(struct zswap_header); + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + &handle); + if (ret == -ENOSPC) { zswap_reject_compress_poor++; - ret = -E2BIG; goto freepage; } - - /* store */ - handle = zs_malloc(tree->pool, dlen, - __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC | - __GFP_NOWARN); - if (!handle) { -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - zswap_writeback_attempted++; - /* - * Copy compressed buffer out of per-cpu storage so - * we can re-enable preemption. 
- */ - tmppage = zswap_tmppage_alloc(); - if (!tmppage) { - zswap_reject_tmppage_fail++; - ret = -ENOMEM; - goto freepage; - } - writeback_attempted = 1; - tmpdst = page_address(tmppage); - memcpy(tmpdst, dst, dlen); - dst = tmpdst; - put_cpu_var(zswap_dstmem); - - /* try to free up some space */ - /* TODO: replace with more targeted policy */ - zswap_writeback_entries(tree, 16); - /* try again, allowing wait */ - handle = zs_malloc(tree->pool, dlen, - __GFP_NORETRY | __GFP_HIGHMEM | __GFP_NOMEMALLOC | - __GFP_NOWARN); - if (!handle) { - /* still no space, fail */ - zswap_reject_zsmalloc_fail++; - ret = -ENOMEM; - goto freepage; - } - zswap_saved_by_writeback++; -#else - ret = -ENOMEM; + if (ret) { + zswap_reject_alloc_fail++; goto freepage; -#endif } - - buf = zs_map_object(tree->pool, handle, ZS_MM_WO); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); + zhdr->swpentry = swp_entry(type, offset); + buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zs_unmap_object(tree->pool, handle); - if (writeback_attempted) - zswap_tmppage_free(tmppage); - else - put_cpu_var(zswap_dstmem); + zpool_unmap_handle(zswap_pool, handle); + put_cpu_var(zswap_dstmem); /* populate entry */ entry->offset = offset; @@ -878,29 +712,21 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); if (ret == -EEXIST) { zswap_duplicate_entry++; - /* remove from rbtree and lru */ - rb_erase(&dupentry->rbnode, &tree->rbroot); - if (!list_empty(&dupentry->lru)) - list_del_init(&dupentry->lru); - if (!zswap_entry_put(dupentry)) { - /* free */ - zswap_free_entry(tree, dupentry); - } + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); - list_add_tail(&entry->lru, &tree->lru); spin_unlock(&tree->lock); /* update stats */ atomic_inc(&zswap_stored_pages); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; freepage: - if (writeback_attempted) - zswap_tmppage_free(tmppage); - else - put_cpu_var(zswap_dstmem); + put_cpu_var(zswap_dstmem); zswap_entry_cache_free(entry); reject: return ret; @@ -917,59 +743,41 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; u8 *src, *dst; unsigned int dlen; - int refcount; + int ret; /* find */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { /* entry was written back */ spin_unlock(&tree->lock); return -1; } - zswap_entry_get(entry); - - /* remove from lru */ - if (!list_empty(&entry->lru)) - list_del_init(&entry->lru); spin_unlock(&tree->lock); /* decompress */ dlen = PAGE_SIZE; - src = zs_map_object(tree->pool, entry->handle, ZS_MM_RO); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zs_unmap_object(tree->pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); + BUG_ON(ret); spin_lock(&tree->lock); - refcount = zswap_entry_put(entry); - if (likely(refcount)) { - list_add_tail(&entry->lru, &tree->lru); - spin_unlock(&tree->lock); - return 0; - } + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - /* - * We don't have to unlink from the rbtree because - * zswap_writeback_entry() or zswap_frontswap_invalidate page() - * has already 
done this for us if we are the last reference. - */ - /* free */ - - zswap_free_entry(tree, entry); - return 0; } -/* invalidates a single page */ +/* frees an entry in zswap */ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - int refcount; /* find */ spin_lock(&tree->lock); @@ -980,80 +788,51 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) return; } - /* remove from rbtree and lru */ - rb_erase(&entry->rbnode, &tree->rbroot); - if (!list_empty(&entry->lru)) - list_del_init(&entry->lru); + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, entry); /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - - if (refcount) { - /* writeback in progress, writeback will free */ - return; - } - - /* free */ - zswap_free_entry(tree, entry); } -/* invalidates all pages for the given swap type */ +/* frees all zswap entries for the given swap type */ static void zswap_frontswap_invalidate_area(unsigned type) { struct zswap_tree *tree = zswap_trees[type]; - struct rb_node *node; - struct zswap_entry *entry; + struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); - /* - * TODO: Even though this code should not be executed because - * the try_to_unuse() in swapoff should have emptied the tree, - * it is very wasteful to rebalance the tree after every - * removal when we are freeing the whole tree. - * - * If post-order traversal code is ever added to the rbtree - * implementation, it should be used here. - */ - while ((node = rb_first(&tree->rbroot))) { - entry = rb_entry(node, struct zswap_entry, rbnode); - rb_erase(&entry->rbnode, &tree->rbroot); - zs_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - } + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(entry); tree->rbroot = RB_ROOT; - INIT_LIST_HEAD(&tree->lru); spin_unlock(&tree->lock); + kfree(tree); + zswap_trees[type] = NULL; } -/* NOTE: this is called in atomic context from swapon and must not sleep */ +static const struct zpool_ops zswap_zpool_ops = { + .evict = zswap_writeback_entry +}; + static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_ATOMIC); - if (!tree) - goto err; - tree->pool = zs_create_pool(GFP_NOWAIT, &zswap_zs_ops); - if (!tree->pool) - goto freetree; + tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; - INIT_LIST_HEAD(&tree->lru); spin_lock_init(&tree->lock); - tree->type = type; zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -1081,16 +860,12 @@ static int __init zswap_debugfs_init(void) if (!zswap_debugfs_root) return -ENOMEM; - debugfs_create_u64("saved_by_writeback", S_IRUGO, - zswap_debugfs_root, &zswap_saved_by_writeback); debugfs_create_u64("pool_limit_hit", S_IRUGO, zswap_debugfs_root, &zswap_pool_limit_hit); - debugfs_create_u64("reject_writeback_attempted", S_IRUGO, - zswap_debugfs_root, &zswap_writeback_attempted); - debugfs_create_u64("reject_tmppage_fail", S_IRUGO, - zswap_debugfs_root, 
&zswap_reject_tmppage_fail); - debugfs_create_u64("reject_zsmalloc_fail", S_IRUGO, - zswap_debugfs_root, &zswap_reject_zsmalloc_fail); + debugfs_create_u64("reject_reclaim_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, zswap_debugfs_root, &zswap_reject_kmemcache_fail); debugfs_create_u64("reject_compress_poor", S_IRUGO, @@ -1099,14 +874,11 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_atomic_t("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); -#ifdef CONFIG_ZSWAP_ENABLE_WRITEBACK - debugfs_create_atomic_t("outstanding_writebacks", S_IRUGO, - zswap_debugfs_root, &zswap_outstanding_writebacks); -#endif + return 0; } @@ -1115,12 +887,12 @@ static void __exit zswap_debugfs_exit(void) debugfs_remove_recursive(zswap_debugfs_root); } #else -static inline int __init zswap_debugfs_init(void) +static int __init zswap_debugfs_init(void) { return 0; } -static inline void __exit zswap_debugfs_exit(void) { } +static void __exit zswap_debugfs_exit(void) { } #endif /********************************* @@ -1128,21 +900,28 @@ static inline void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { - if (!zswap_enabled) - return 0; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; pr_info("loading zswap\n"); - if (zswap_entry_cache_create()) { - pr_err("entry cache creation failed\n"); - goto error; + + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); } - if (zswap_page_pool_create()) { - pr_err("page pool initialization failed\n"); - goto pagepoolfail; + if (!zswap_pool) { + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); + goto error; } - if (zswap_tmppage_pool_create()) { - pr_err("workmem pool initialization failed\n"); - goto tmppoolfail; + pr_info("using %s pool\n", zswap_zpool_type); + + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -1152,6 +931,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -1159,11 +939,9 @@ static int __init init_zswap(void) pcpufail: zswap_comp_exit(); compfail: - zswap_tmppage_pool_destroy(); -tmppoolfail: - zswap_page_pool_destroy(); -pagepoolfail: - zswap_entry_cache_destory(); + zswap_entry_cache_destroy(); +cachefail: + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } @@ -1171,5 +949,5 @@ static int __init init_zswap(void) late_initcall(init_zswap); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Seth Jennings "); -MODULE_DESCRIPTION("Compressed cache for swap pages"); +MODULE_AUTHOR("Seth 
Jennings "); +MODULE_DESCRIPTION("Compressed cache for swap pages"); \ No newline at end of file From a04038f9daaa900337ca39735adeb7ee16ca045e Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 19 Jun 2016 05:22:00 -0400 Subject: [PATCH 31/52] swap: make each swap partition have one address_space When I use several fast SSD to do swap, swapper_space.tree_lock is heavily contended. This makes each swap partition have one address_space to reduce the lock contention. There is an array of address_space for swap. The swap entry type is the index to the array. In my test with 3 SSD, this increases the swapout throughput 20%. [akpm@linux-foundation.org: revert unneeded change to __add_to_swap_cache] Signed-off-by: Shaohua Li Cc: Hugh Dickins Acked-by: Rik van Riel Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 4 ++-- include/linux/swap.h | 5 +++-- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/swap.c | 8 +++++++- mm/swap_state.c | 40 +++++++++++++++++++++++++++++----------- mm/swapfile.c | 4 ++-- 7 files changed, 45 insertions(+), 20 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 33f9e9b81..e1bd03dda 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - - total_swapcache_pages - i.bufferram; + total_swapcache_pages()- i.bufferram; if (cached < 0) cached = 0; @@ -112,7 +112,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram), K(i.bufferram), K(cached), - K(total_swapcache_pages), + K(total_swapcache_pages()), K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), K(pages[LRU_ACTIVE_ANON]), diff --git a/include/linux/swap.h b/include/linux/swap.h index 00bdb7df3..102021b8a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -341,7 +342,7 @@ extern void end_swap_bio_read(struct bio *bio, int err); /* linux/mm/swap_state.c */ extern struct address_space swapper_spaces[]; #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) -#define total_swapcache_pages swapper_space.nrpages +extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); @@ -414,7 +415,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define get_nr_swap_pages() 0L #define total_swap_pages 0L -#define total_swapcache_pages 0UL +#define total_swapcache_pages() 0UL #define vm_swap_full(si) 0 #define si_swapinfo(val) \ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5265cb948..6e96f56da 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5229,7 +5229,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) *entry = swap; - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif return page; diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8..cc439ae35 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) /* shmem/tmpfs may return swap: account for swapcache page too. 
*/ if (radix_tree_exceptional_entry(page)) { swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(&swapper_space, swap.val); + page = find_get_page(swap_address_space(swap), swap.val); } #endif if (page) { diff --git a/mm/swap.c b/mm/swap.c index a8feea68a..db74d14f9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -845,7 +845,13 @@ void __init swap_setup(void) unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP - bdi_init(swapper_space.backing_dev_info); + int i; + + bdi_init(swapper_spaces[0].backing_dev_info); + for (i = 0; i < MAX_SWAPFILES; i++) { + spin_lock_init(&swapper_spaces[i].tree_lock); + INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); + } #endif /* Use a smaller cluster for small-memory machines */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 51fc0cd2f..7692d99b9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -52,9 +52,19 @@ static struct { unsigned long find_total; } swap_cache_info; +unsigned long total_swapcache_pages(void) +{ + int i; + unsigned long ret = 0; + + for (i = 0; i < MAX_SWAPFILES; i++) + ret += swapper_spaces[i].nrpages; + return ret; +} + void show_swap_cache_info(void) { - printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); @@ -70,6 +80,7 @@ void show_swap_cache_info(void) int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageSwapCache(page)); @@ -79,14 +90,15 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); set_page_private(page, entry.val); - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); + error = radix_tree_insert(&address_space->page_tree, entry.val, page); if (likely(!error)) { - total_swapcache_pages++; + address_space->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); if (unlikely(error)) { /* @@ -122,14 +134,18 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { + swp_entry_t entry; + struct address_space *address_space; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapCache(page)); VM_BUG_ON(PageWriteback(page)); - radix_tree_delete(&swapper_space.page_tree, page_private(page)); + entry.val = page_private(page); + address_space = swap_address_space(entry); + radix_tree_delete(&address_space->page_tree, page_private(page)); set_page_private(page, 0); ClearPageSwapCache(page); - total_swapcache_pages--; + address_space->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(del_total); } @@ -195,12 +211,14 @@ int add_to_swap(struct page *page) void delete_from_swap_cache(struct page *page) { swp_entry_t entry; + struct address_space *address_space; entry.val = page_private(page); - spin_lock_irq(&swapper_space.tree_lock); + address_space = swap_address_space(entry); + spin_lock_irq(&address_space->tree_lock); __delete_from_swap_cache(page); - spin_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&address_space->tree_lock); 
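/*
 * [Editor's note: illustrative sketch only, not part of this patch.]
 * Every swap-cache lookup in this series derives its address_space from
 * the swap entry instead of the old global swapper_space, e.g.:
 *
 *	struct address_space *as = swap_address_space(entry);
 *	struct page *page = find_get_page(as, entry.val);
 *
 * Because each swapper_spaces[swp_type(entry)] carries its own tree_lock,
 * swap-cache updates for different swap devices no longer contend on one
 * global lock, which is exactly the contention the changelog measured.
 */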
swapcache_free(entry, page); page_cache_release(page); @@ -263,7 +281,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) { struct page *page; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page) INC_CACHE_INFO(find_success); @@ -290,7 +308,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(&swapper_space, entry.val); + found_page = find_get_page(swap_address_space(entry), entry.val); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 8e55e729f..3521cf159 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -101,7 +101,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) struct page *page; int ret = 0; - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (!page) return 0; /* @@ -766,7 +766,7 @@ int free_swap_and_cache(swp_entry_t entry) p = swap_info_get(entry); if (p) { if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { - page = find_get_page(&swapper_space, entry.val); + page = find_get_page(swap_address_space(entry), entry.val); if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; From fec6e39e7fc8346984ef488c49bb133f06a251c2 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 19 Jun 2016 07:50:33 -0400 Subject: [PATCH 32/52] backport: needed parts for zram & zsmalloc from 3.10 --- include/linux/compiler.h | 104 +++++++++++++++++++++++++++++++++++++++ include/linux/cpu.h | 12 +++++ include/linux/mm.h | 2 +- include/linux/rbtree.h | 31 ++++++++++++ kernel/cpu.c | 12 +++++ lib/rbtree.c | 40 +++++++++++++++ mm/page_alloc.c | 6 +-- mm/slob.c | 2 +- mm/slub.c | 2 +- mm/zsmalloc.c | 2 +- 10 files changed, 206 insertions(+), 7 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 923d093c9..7f5c2a39e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -164,6 +164,110 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); (typeof(ptr)) (__ptr + (off)); }) #endif +#include + +#define __READ_ONCE_SIZE \ +({ \ + switch (size) { \ + case 1: *(__u8 *)res = *(volatile __u8 *)p; break; \ + case 2: *(__u16 *)res = *(volatile __u16 *)p; break; \ + case 4: *(__u32 *)res = *(volatile __u32 *)p; break; \ + case 8: *(__u64 *)res = *(volatile __u64 *)p; break; \ + default: \ + barrier(); \ + __builtin_memcpy((void *)res, (const void *)p, size); \ + barrier(); \ + } \ +}) + +static __always_inline +void __read_once_size(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} + +#ifdef CONFIG_KASAN +/* + * This function is not 'inline' because __no_sanitize_address confilcts + * with inlining. Attempt to inline it may cause a build failure. + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368 + * '__maybe_unused' allows us to avoid defined-but-not-used warnings. 
+ */ +static __no_sanitize_address __maybe_unused +void __read_once_size_nocheck(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} +#else +static __always_inline +void __read_once_size_nocheck(const volatile void *p, void *res, int size) +{ + __READ_ONCE_SIZE; +} +#endif + +static __always_inline void __write_once_size(volatile void *p, void *res, int size) +{ + switch (size) { + case 1: *(volatile __u8 *)p = *(__u8 *)res; break; + case 2: *(volatile __u16 *)p = *(__u16 *)res; break; + case 4: *(volatile __u32 *)p = *(__u32 *)res; break; + case 8: *(volatile __u64 *)p = *(__u64 *)res; break; + default: + barrier(); + __builtin_memcpy((void *)p, (const void *)res, size); + barrier(); + } +} + +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE, WRITE_ONCE and ACCESS_ONCE (see below), but only when the + * compiler is aware of some particular ordering. One way to make the + * compiler aware of ordering is to put the two invocations of READ_ONCE, + * WRITE_ONCE or ACCESS_ONCE() in different C statements. + * + * In contrast to ACCESS_ONCE these two macros will also work on aggregate + * data types like structs or unions. If the size of the accessed data + * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at + * least two memcpy()s: one for the __builtin_memcpy() and then one for + * the macro doing the copy of variable - '__u' allocated on the stack. + * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ + +#define __READ_ONCE(x, check) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u; \ + if (check) \ + __read_once_size(&(x), __u.__c, sizeof(x)); \ + else \ + __read_once_size_nocheck(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) +#define READ_ONCE(x) __READ_ONCE(x, 1) + +/* + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need + * to hide memory access from KASAN. 
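 *
 * [Editor's note: illustrative usage, not part of the original backport.]
 * A flag shared between a lockless writer and reader would typically be
 * accessed through these macros as:
 *
 *	WRITE_ONCE(shared->ready, 1);		(writer side)
 *
 *	if (READ_ONCE(shared->ready))		(reader side)
 *		consume(shared);
 *
 * 'shared', its 'ready' field and consume() are hypothetical names used
 * only to show the pattern; the macros themselves are the ones defined
 * in this hunk.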
+ */ +#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0) + +#define WRITE_ONCE(x, val) \ +({ \ + union { typeof(x) __val; char __c[1]; } __u = \ + { .__val = (__force typeof(x)) (val) }; \ + __write_once_size(&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 54afc7f76..233c27277 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -126,21 +126,33 @@ enum { #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ #ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); +extern void __unregister_cpu_notifier(struct notifier_block *nb); #else #ifndef MODULE extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); #else static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } + +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} #endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { } + +static inline void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} #endif int cpu_up(unsigned int cpu); diff --git a/include/linux/mm.h b/include/linux/mm.h index f5ca7ecfc..773577495 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -389,7 +389,7 @@ static inline struct page *compound_head(struct page *page) * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ -static inline void reset_page_mapcount(struct page *page) +static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 033b507b3..99ce4aedd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -161,6 +161,10 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); @@ -174,4 +178,31 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +#define rb_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? rb_entry(____ptr, type, member) : NULL; \ + }) + +/** + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. 
This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. + */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ + pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ + typeof(*pos), field); 1; }); \ + pos = n) #endif /* _LINUX_RBTREE_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 02caf610c..19362f187 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -166,6 +166,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -189,6 +194,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -198,6 +204,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/lib/rbtree.c b/lib/rbtree.c index d4175565d..68d1faf12 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -460,3 +460,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, *new = *victim; } EXPORT_SYMBOL(rb_replace_node); + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} +EXPORT_SYMBOL(rb_next_postorder); + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} +EXPORT_SYMBOL(rb_first_postorder); \ No newline at end of file diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79d3c67be..fd68682b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -317,7 +317,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ return; } @@ -349,7 +349,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - reset_page_mapcount(page); /* remove PageBuddy */ + page_mapcount_reset(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -4039,7 +4039,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, set_page_links(page, zone, nid, pfn); 
mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); - reset_page_mapcount(page); + page_mapcount_reset(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for diff --git a/mm/slob.c b/mm/slob.c index 8105be42c..dcc976b33 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -117,7 +117,7 @@ static inline void struct_slob_page_wrong_size(void) */ static inline void free_slob_page(struct slob_page *sp) { - reset_page_mapcount(&sp->page); + page_mapcount_reset(&sp->page); sp->page.mapping = NULL; } diff --git a/mm/slub.c b/mm/slub.c index 286f736e9..5cb23f8ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1419,7 +1419,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) -pages); __ClearPageSlab(page); - reset_page_mapcount(page); + page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 8cb22699e..7543e3fca 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1065,7 +1065,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } From 5164b305af9a4240861a597c8fb3d1e9b455c450 Mon Sep 17 00:00:00 2001 From: carck Date: Sun, 19 Jun 2016 09:41:56 -0400 Subject: [PATCH 33/52] mm: new shrinker API The current shrinker callout API uses an a single shrinker call for multiple functions. To determine the function, a special magical value is passed in a parameter to change the behaviour. This complicates the implementation and return value specification for the different behaviours. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Separate the two different behaviours into separate operations, one to return a count of freeable objects in the cache, and another to scan a certain number of objects in the cache for freeing. In defining these new operations, ensure the return values and resultant behaviours are clearly defined and documented. Modify shrink_slab() to use the new API and implement the callouts for all the existing shrinkers. Signed-off-by: Dave Chinner Signed-off-by: Glauber Costa Acked-by: Mel Gorman Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve HjønnevÃ¥g Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro Signed-off-by: Pranav Vashi --- include/linux/shrinker.h | 40 +++++++++++++++++------ mm/vmscan.c | 70 ++++++++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 37 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 5c25b9a8c..e4a26ea07 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,6 +4,12 @@ /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extention later. + * + * The 'gfpmask' refers to the allocation we are currently trying to + * fulfil. 
+ * + * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is + * querying the cache size, so a fastpath for that case is appropriate. */ struct shrink_control { gfp_t gfp_mask; @@ -14,24 +20,38 @@ struct shrink_control { int priority; }; +#define SHRINK_STOP (~0UL) + /* * A callback you can register to apply pressure to ageable caches. * - * 'sc' is passed shrink_control which includes a count 'nr_to_scan' - * and a 'gfpmask'. It should look through the least-recently-used - * 'nr_to_scan' entries and attempt to free them up. It should return - * the number of objects which remain in the cache. If it returns -1, it means - * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * The callback must not return -1 if nr_to_scan is zero. + * @shrink() should look through the least-recently-used 'nr_to_scan' entries + * and attempt to free them up. It should return the number of objects which + * remain in the cache. If it returns -1, it means it cannot do any scanning at + * this time (eg. there is a risk of deadlock). * - * The 'gfpmask' refers to the allocation we are currently trying to - * fulfil. + * @count_objects should return the number of freeable items in the cache. If + * there are no objects to free or the number of freeable items cannot be + * determined, it should return 0. No deadlock checks should be done during the + * count callback - the shrinker relies on aggregating scan counts that couldn't + * be executed due to potential deadlocks to be run at a later call when the + * deadlock condition is no longer pending. * - * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is - * querying the cache size, so a fastpath for that case is appropriate. + * @scan_objects will only be called if @count_objects returned a non-zero + * value for the number of freeable objects. The callout should scan the cache + * and attempt to free items from the cache. It should then return the number + * of objects freed during the scan, or SHRINK_STOP if progress cannot be made + * due to potential deadlocks. If SHRINK_STOP is returned, then no further + * attempts to call the @scan_objects will be made from the current reclaim + * context. */ struct shrinker { int (*shrink)(struct shrinker *, struct shrink_control *sc); + unsigned long (*count_objects)(struct shrinker *, + struct shrink_control *sc); + unsigned long (*scan_objects)(struct shrinker *, + struct shrink_control *sc); + int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 29d01dae4..3df4609d1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -274,34 +274,41 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker, * * Returns the number of slab objects which we shrunk. */ -unsigned long shrink_slab(struct shrink_control *shrink, +unsigned long shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_pages_scanned, unsigned long lru_pages) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; if (nr_pages_scanned == 0) nr_pages_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. 
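 *
 * [Editor's note: illustrative sketch, not part of the original patch.]
 * Under the split API introduced by this change, a cache implements the
 * two callbacks roughly like this (all foo_* names are hypothetical):
 *
 *	static unsigned long foo_count(struct shrinker *s,
 *				       struct shrink_control *sc)
 *	{
 *		return atomic_long_read(&foo_nr_cached);   (0 = nothing to free)
 *	}
 *
 *	static unsigned long foo_scan(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		unsigned long freed;
 *
 *		if (!mutex_trylock(&foo_lock))
 *			return SHRINK_STOP;
 *		freed = foo_evict(sc->nr_to_scan);
 *		mutex_unlock(&foo_lock);
 *		return freed;
 *	}
 *
 * i.e. count_objects() only reports how many objects are freeable and must
 * never deadlock, while scan_objects() does the actual freeing and returns
 * the number freed, or SHRINK_STOP when it cannot make progress, per the
 * contract documented in include/linux/shrinker.h above.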
+ */ + freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - long total_scan, pages_got; + long total_scan; long max_pass; - int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) + if (shrinker->count_objects) + max_pass = shrinker->count_objects(shrinker, shrinkctl); + else + max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0); + if (max_pass <= 0) continue; /* @@ -317,9 +324,9 @@ unsigned long shrink_slab(struct shrink_control *shrink, do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); + printk(KERN_ERR + "shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->shrink, total_scan); total_scan = max_pass; } @@ -346,26 +353,35 @@ unsigned long shrink_slab(struct shrink_control *shrink, if (total_scan > max_pass * 2) total_scan = max_pass * 2; - trace_mm_shrink_slab_start(shrinker, shrink, nr, + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, max_pass, delta, total_scan); while (total_scan >= batch_size) { - int nr_before; - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) { - pages_got = nr_before - shrink_ret; - ret += pages_got; - total_scan -= pages_got > batch_size ? pages_got : batch_size; - } else { - total_scan -= batch_size; - } + if (shrinker->scan_objects) { + unsigned long ret; + shrinkctl->nr_to_scan = batch_size; + ret = shrinker->scan_objects(shrinker, shrinkctl); + + if (ret == SHRINK_STOP) + break; + freed += ret; + } else { + int nr_before; + long ret; + + nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0); + ret = do_shrinker_shrink(shrinker, shrinkctl, + batch_size); + if (ret == -1) + break; + if (ret < nr_before) + freed += nr_before - ret; + } + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } @@ -381,12 +397,12 @@ unsigned long shrink_slab(struct shrink_control *shrink, else new_nr = atomic_long_read(&shrinker->nr_in_batch); - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); } up_read(&shrinker_rwsem); out: cond_resched(); - return ret; + return freed; } static inline int is_page_cache_freeable(struct page *page) From 30cc1dac4b954c3a649a1ad9d538a8fd8380697a Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 01:44:20 -0400 Subject: [PATCH 34/52] backport: needed parts for zram & zsmalloc from 3.10 --- arch/arm/configs/ik_defconfig | 2 +- drivers/misc/sec_misc.c | 4 ++-- drivers/staging/android/lowmemorykiller.c | 10 +++++----- include/linux/mm.h | 13 +------------ mm/mincore.c | 2 +- mm/util.c | 20 ++++++++++++++++++++ mm/zswap.c | 4 ++-- 7 files changed, 32 insertions(+), 23 deletions(-) diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index 09688cb39..ff131efed 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -614,7 +614,7 @@ CONFIG_PGTABLE_MAPPING=y CONFIG_ZSWAP=y # CONFIG_SWAP_ENABLE_READAHEAD is not set # CONFIG_ZSWAP_ENABLE_WRITEBACK is not set -ONFIG_ZPOOL=y +CONFIG_ZPOOL=y # CONFIG_ZBUD is not set CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY=y CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y diff --git 
a/drivers/misc/sec_misc.c b/drivers/misc/sec_misc.c index dbd80b854..5a6a17eb7 100755 --- a/drivers/misc/sec_misc.c +++ b/drivers/misc/sec_misc.c @@ -283,7 +283,7 @@ static ssize_t drop_caches_store si_meminfo(&i); printk("[Before]\nMemFree : %8lu kB\n", K(i.freeram)); printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \ - total_swapcache_pages - i.bufferram)); + total_swapcache_pages() - i.bufferram)); iterate_supers(drop_pagecache_sb, NULL); drop_slab(); @@ -291,7 +291,7 @@ static ssize_t drop_caches_store si_meminfo(&i); printk("[After]\nMemFree : %8lu kB\n", K(i.freeram)); printk("Cached : %8lu kB\n\n", K(global_page_state(NR_FILE_PAGES) - \ - total_swapcache_pages - i.bufferram)); + total_swapcache_pages() - i.bufferram)); printk("Cached Drop done!\n"); } out: diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 159cbf8a7..07760d099 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -144,7 +144,7 @@ static DEFINE_MUTEX(scan_mutex); #endif #if defined(CONFIG_ZSWAP) -extern atomic_t zswap_pool_pages; +extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #endif @@ -205,8 +205,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) flag = 1; } #endif - if (global_page_state(NR_SHMEM) + total_swapcache_pages < other_file) - other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages; + if (global_page_state(NR_SHMEM) + total_swapcache_pages() < other_file) + other_file -= global_page_state(NR_SHMEM) + total_swapcache_pages(); else other_file = 0; @@ -275,7 +275,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) #if defined(CONFIG_ZSWAP) if (atomic_read(&zswap_stored_pages)) { lowmem_print(3, "shown tasksize : %d\n", tasksize); - tasksize += atomic_read(&zswap_pool_pages) * get_mm_counter(p->mm, MM_SWAPENTS) + tasksize += DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) * get_mm_counter(p->mm, MM_SWAPENTS) / atomic_read(&zswap_stored_pages); lowmem_print(3, "real tasksize : %d\n", tasksize); } @@ -401,7 +401,7 @@ static int android_oom_handler(struct notifier_block *nb, nr_cma_active_file = global_page_state(NR_CMA_ACTIVE_FILE); other_file = global_page_state(NR_FILE_PAGES) - global_page_state(NR_SHMEM) - - total_swapcache_pages - + total_swapcache_pages() - nr_cma_inactive_file - nr_cma_active_file; #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 773577495..4e7889f1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -810,18 +810,7 @@ void page_address_init(void); #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) -extern struct address_space swapper_space; -static inline struct address_space *page_mapping(struct page *page) -{ - struct address_space *mapping = page->mapping; - - VM_BUG_ON(PageSlab(page)); - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; -} +extern struct address_space *page_mapping(struct page *page); /* Neutral page->mapping pointer to address_space or anon_vma or other */ static inline void *page_rmapping(struct page *page) diff --git a/mm/mincore.c b/mm/mincore.c index cc439ae35..8be8c84b9 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -135,7 +135,7 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { #ifdef CONFIG_SWAP pgoff = entry.val; - *vec = mincore_page(&swapper_space, 
pgoff); + *vec = mincore_page(swap_address_space(entry), pgoff); #else WARN_ON(1); *vec = 1; diff --git a/mm/util.c b/mm/util.c index ae962b31d..a280deeb1 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -341,6 +343,24 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +struct address_space *page_mapping(struct page *page) +{ + struct address_space *mapping = page->mapping; + + VM_BUG_ON(PageSlab(page)); +#ifdef CONFIG_SWAP + if (unlikely(PageSwapCache(page))) { + swp_entry_t entry; + + entry.val = page_private(page); + mapping = swap_address_space(entry); + } else +#endif + if ((unsigned long)mapping & PAGE_MAPPING_ANON) + mapping = NULL; + return mapping; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/zswap.c b/mm/zswap.c index ffc2c794d..b8405e481 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -46,9 +46,9 @@ * statistics **********************************/ /* Total bytes used by the compressed storage */ -static u64 zswap_pool_total_size; +u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ -static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_t zswap_stored_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for From eb3b7a0d58338d2a3659715363aaff550d62b7a3 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 03:46:36 -0400 Subject: [PATCH 35/52] backport: needed parts for zram & zsmalloc from 3.10 --- drivers/staging/android/lowmemorykiller.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 07760d099..9b01986b8 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -174,6 +174,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) bool is_active_high; bool flag = 0; #endif + u32 zswap_pool_total_pages; if (nr_to_scan > 0) { if (mutex_lock_interruptible(&scan_mutex) < 0) @@ -275,7 +276,8 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) #if defined(CONFIG_ZSWAP) if (atomic_read(&zswap_stored_pages)) { lowmem_print(3, "shown tasksize : %d\n", tasksize); - tasksize += DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE) * get_mm_counter(p->mm, MM_SWAPENTS) + zswap_pool_total_pages = DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + tasksize += zswap_pool_total_pages * get_mm_counter(p->mm, MM_SWAPENTS) / atomic_read(&zswap_stored_pages); lowmem_print(3, "real tasksize : %d\n", tasksize); } From cebe5b90d72ef6e84f3d1c545e3f8cbde7a3e62d Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 04:06:23 -0400 Subject: [PATCH 36/52] drivers: Add state notifier driver big thanks to @neobuddy89 * Doze mode and FB notiers aren't friendly. Introduce state notifier to make things smooth. Merged all latest changes and adapted to my mdss driver. 
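[Editor's note: the sketch below is an illustration added for this write-up, not part of the patch. It shows how a client driver might consume the notifier chain introduced here. The driver name and callback are hypothetical; STATE_NOTIFIER_ACTIVE, STATE_NOTIFIER_SUSPEND, state_register_client() and state_unregister_client() come from the new include/linux/state_notifier.h, and the mdss_dsi hunks below are what actually feed the chain via state_resume()/state_suspend().]

#include <linux/notifier.h>
#include <linux/state_notifier.h>

static int example_state_cb(struct notifier_block *nb,
			    unsigned long event, void *data)
{
	switch (event) {
	case STATE_NOTIFIER_ACTIVE:
		/* panel is back on: restore normal operation */
		break;
	case STATE_NOTIFIER_SUSPEND:
		/* panel has been off for suspend_defer_time: enter power saving */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_state_nb = {
	.notifier_call = example_state_cb,
};

/* in the client's init path:     state_register_client(&example_state_nb);   */
/* and on teardown:               state_unregister_client(&example_state_nb); */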
--- drivers/input/touchscreen/Kconfig | 6 + drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/state_notifier.c | 135 +++++++++++++++++++++ drivers/video/msm/mdss/mdss_dsi.c | 12 ++ include/linux/state_notifier.h | 19 +++ 5 files changed, 173 insertions(+) create mode 100644 drivers/input/touchscreen/state_notifier.c create mode 100644 include/linux/state_notifier.h diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index f0f234dda..bc44d4c6c 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -1145,6 +1145,12 @@ config TOUCHSCREEN_ZINITIX_BT531 To compile this driver as a module, choose M here: the module will be called BT531. +config STATE_NOTIFIER + bool "State Notifier" + depends on CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI + default y + + source "drivers/input/touchscreen/wacom/Kconfig" source "drivers/input/touchscreen/gt9xx/Kconfig" source "drivers/input/touchscreen/synaptics/Kconfig" diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index be6587f82..33101e7e9 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -100,3 +100,4 @@ obj-$(CONFIG_TOUCHSCREEN_FTS_S) += stm_s/ obj-$(CONFIG_TOUCHSCREEN_FTS) += stm_fts/ obj-$(CONFIG_TOUCHSCREEN_ZINITIX_BT532) += zinitix_bt532_ts.o obj-$(CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE) += synaptics_jsglte/ +obj-$(CONFIG_STATE_NOTIFIER) += state_notifier.o diff --git a/drivers/input/touchscreen/state_notifier.c b/drivers/input/touchscreen/state_notifier.c new file mode 100644 index 000000000..7c67b9fc6 --- /dev/null +++ b/drivers/input/touchscreen/state_notifier.c @@ -0,0 +1,135 @@ +/* + * State Notifier Driver + * + * Copyright (c) 2013-2015, Pranav Vashi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include + +#define DEFAULT_SUSPEND_DEFER_TIME 10 +#define STATE_NOTIFIER "state_notifier" + +/* + * debug = 1 will print all + */ +static unsigned int debug = 1; +module_param_named(debug_mask, debug, uint, 0644); + +static bool state_suspended; +module_param_named(state_suspended, state_suspended, bool, 0444); + +#define dprintk(msg...) 
\ +do { \ + if (debug) \ + pr_info(msg); \ +} while (0) + +static unsigned int suspend_defer_time = DEFAULT_SUSPEND_DEFER_TIME; +module_param_named(suspend_defer_time, suspend_defer_time, uint, 0664); +static struct delayed_work suspend_work; +static struct workqueue_struct *susp_wq; +struct work_struct resume_work; +static bool suspend_in_progress; + +static BLOCKING_NOTIFIER_HEAD(state_notifier_list); + +/** + * state_register_client - register a client notifier + * @nb: notifier block to callback on events + */ +int state_register_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_register_client); + +/** + * state_unregister_client - unregister a client notifier + * @nb: notifier block to callback on events + */ +int state_unregister_client(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&state_notifier_list, nb); +} +EXPORT_SYMBOL(state_unregister_client); + +/** + * state_notifier_call_chain - notify clients on state_events + * @val: Value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + */ +int state_notifier_call_chain(unsigned long val, void *v) +{ + return blocking_notifier_call_chain(&state_notifier_list, val, v); +} +EXPORT_SYMBOL_GPL(state_notifier_call_chain); + +static void _suspend_work(struct work_struct *work) +{ + state_notifier_call_chain(STATE_NOTIFIER_SUSPEND, NULL); + state_suspended = true; + suspend_in_progress = false; + dprintk("%s: suspend completed.\n", STATE_NOTIFIER); +} + +static void _resume_work(struct work_struct *work) +{ + state_notifier_call_chain(STATE_NOTIFIER_ACTIVE, NULL); + msleep_interruptible(50); + state_suspended = false; + dprintk("%s: resume completed.\n", STATE_NOTIFIER); +} + +void state_suspend(void) +{ + if (state_suspended || suspend_in_progress) + return; + + dprintk("%s: suspend called.\n", STATE_NOTIFIER); + suspend_in_progress = true; + + queue_delayed_work_on(0, susp_wq, &suspend_work, + msecs_to_jiffies(suspend_defer_time * 1000)); +} + +void state_resume(void) +{ + dprintk("%s: resume called.\n", STATE_NOTIFIER); + cancel_delayed_work_sync(&suspend_work); + suspend_in_progress = false; + + if (state_suspended) + queue_work_on(0, susp_wq, &resume_work); +} + +static int __init state_notifier_init(void) +{ + susp_wq = create_singlethread_workqueue("state_susp_wq"); + + if (!susp_wq) { + pr_err("State Notifier failed to allocate suspend workqueue\n"); + return 0; + } + + INIT_DELAYED_WORK(&suspend_work, _suspend_work); + INIT_WORK(&resume_work, _resume_work); + + return 0; +} + +subsys_initcall(state_notifier_init); + +MODULE_AUTHOR("Pranav Vashi "); +MODULE_DESCRIPTION("State Notifier Driver"); +MODULE_LICENSE("GPLv2"); diff --git a/drivers/video/msm/mdss/mdss_dsi.c b/drivers/video/msm/mdss/mdss_dsi.c index 3163ebd9a..a15204ef3 100644 --- a/drivers/video/msm/mdss/mdss_dsi.c +++ b/drivers/video/msm/mdss/mdss_dsi.c @@ -23,6 +23,10 @@ #include #include +#ifdef CONFIG_STATE_NOTIFIER +#include +#endif + #include "mdss.h" #include "mdss_fb.h" #include "mdss_panel.h" @@ -1474,6 +1478,10 @@ static int mdss_dsi_event_handler(struct mdss_panel_data *pdata, ctrl_pdata->mdp_tg_on = 1; if (ctrl_pdata->on_cmds.link_state == DSI_HS_MODE) rc = mdss_dsi_unblank(pdata); +#ifdef CONFIG_STATE_NOTIFIER + if (!use_fb_notifier) + state_resume(); +#endif break; case MDSS_EVENT_BLANK: if (ctrl_pdata->off_cmds.link_state == DSI_HS_MODE) @@ -1484,6 +1492,10 @@ static int mdss_dsi_event_handler(struct 
mdss_panel_data *pdata, if (ctrl_pdata->off_cmds.link_state == DSI_LP_MODE) rc = mdss_dsi_blank(pdata); rc = mdss_dsi_off(pdata); +#ifdef CONFIG_STATE_NOTIFIER + if (!use_fb_notifier) + state_suspend(); +#endif break; #if defined(CONFIG_FB_MSM_MDSS_S6E8AA0A_HD_PANEL) case MTP_READ: diff --git a/include/linux/state_notifier.h b/include/linux/state_notifier.h new file mode 100644 index 000000000..34f9287aa --- /dev/null +++ b/include/linux/state_notifier.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_STATE_NOTIFIER_H +#define __LINUX_STATE_NOTIFIER_H + +#include + +#define STATE_NOTIFIER_ACTIVE 0x01 +#define STATE_NOTIFIER_SUSPEND 0x02 + +struct state_event { + void *data; +}; + +extern void state_suspend(void); +extern void state_resume(void); +int state_register_client(struct notifier_block *nb); +int state_unregister_client(struct notifier_block *nb); +int state_notifier_call_chain(unsigned long val, void *v); + +#endif /* _LINUX_STATE_NOTIFIER_H */ From 31ee9d80ff66af505abe9f3a519b642dbd0fc783 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 04:27:03 -0400 Subject: [PATCH 37/52] Config: Enable state notifier & zswap --- arch/arm/configs/ik_defconfig | 1 + mm/zswap.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index ff131efed..5ff50e067 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -1727,6 +1727,7 @@ CONFIG_INPUT_WACOM=y # CONFIG_TOUCHSCREEN_FTS is not set # CONFIG_TOUCHSCREEN_FTS_S is not set # CONFIG_TOUCHSCREEN_SYNAPTICS_I2C_RMI_JSGLTE is not set +CONFIG_STATE_NOTIFIER=y CONFIG_INPUT_MISC=y # CONFIG_INPUT_AD714X is not set # CONFIG_INPUT_BMA150 is not set diff --git a/mm/zswap.c b/mm/zswap.c index b8405e481..d5980efc1 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -77,12 +77,16 @@ static u64 zswap_duplicate_entry; **********************************/ /* Enable/disable zswap (disabled by default) */ -static bool zswap_enabled; +static bool zswap_enabled = 1; module_param_named(enabled, zswap_enabled, bool, 0644); /* Compressor to be used by zswap (fixed at boot for now) */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" +#ifndef CONFIG_CRYPTO_LZ4 static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +#else +static char *zswap_compressor = "lz4"; +#endif module_param_named(compressor, zswap_compressor, charp, 0444); /* The maximum percentage of memory that the compressed pool can occupy */ From 4d13b96989775614f25e871136b1b6f1a348c6d2 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 06:03:28 -0400 Subject: [PATCH 38/52] RAMDISK: set swappiness to 100 (reverted from commit 081788bee62d0e25d17a1f12e42a3a72a71c63ff) --- ik.ramdisk/common/init.rc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ik.ramdisk/common/init.rc b/ik.ramdisk/common/init.rc index 756f30c68..7c6d46d58 100755 --- a/ik.ramdisk/common/init.rc +++ b/ik.ramdisk/common/init.rc @@ -72,7 +72,7 @@ on init chown root system /sys/fs/cgroup/memory/tasks chmod 0660 /sys/fs/cgroup/memory/tasks mkdir /sys/fs/cgroup/memory/sw 0750 root system - write /sys/fs/cgroup/memory/sw/memory.swappiness 50 + write /sys/fs/cgroup/memory/sw/memory.swappiness 100 write /sys/fs/cgroup/memory/sw/memory.move_charge_at_immigrate 1 chown root system /sys/fs/cgroup/memory/sw/tasks chmod 0660 /sys/fs/cgroup/memory/sw/tasks From e96e0456f078d6491f657115bb6a919d57d9b955 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 06:28:38 -0400 Subject: [PATCH 39/52] staging: zcache: delete it 
zcache is obsolete and not used anymore, Bob Liu has rewritten it and is submitting it for inclusion through the main -mm tree, as it should have been done in the first place... Cc: Bob Liu Cc: Konrad Rzeszutek Wilk Cc: Kyungmin Park Cc: Wanpeng Li Signed-off-by: Greg Kroah-Hartman Signed-off-by: Pranav Vashi --- drivers/staging/ramster/Kconfig | 13 - drivers/staging/ramster/Makefile | 1 - drivers/staging/ramster/TODO | 13 - drivers/staging/ramster/cluster/Makefile | 3 - drivers/staging/ramster/cluster/heartbeat.c | 464 --- drivers/staging/ramster/cluster/heartbeat.h | 87 - drivers/staging/ramster/cluster/masklog.c | 155 - drivers/staging/ramster/cluster/masklog.h | 220 -- drivers/staging/ramster/cluster/nodemanager.c | 992 ----- drivers/staging/ramster/cluster/nodemanager.h | 88 - .../ramster/cluster/ramster_nodemanager.h | 39 - drivers/staging/ramster/cluster/tcp.c | 2256 ----------- drivers/staging/ramster/cluster/tcp.h | 159 - .../staging/ramster/cluster/tcp_internal.h | 248 -- drivers/staging/ramster/r2net.c | 401 -- drivers/staging/ramster/ramster.h | 118 - drivers/staging/ramster/tmem.c | 851 ----- drivers/staging/ramster/tmem.h | 244 -- drivers/staging/ramster/xvmalloc.c | 509 --- drivers/staging/ramster/xvmalloc.h | 30 - drivers/staging/ramster/xvmalloc_int.h | 95 - drivers/staging/ramster/zcache-main.c | 3320 ----------------- drivers/staging/ramster/zcache.h | 22 - drivers/staging/zcache/Kconfig | 14 - drivers/staging/zcache/Makefile | 3 - drivers/staging/zcache/tmem.c | 770 ---- drivers/staging/zcache/tmem.h | 206 - drivers/staging/zcache/zcache-main.c | 2083 ----------- 28 files changed, 13404 deletions(-) delete mode 100644 drivers/staging/ramster/Kconfig delete mode 100644 drivers/staging/ramster/Makefile delete mode 100644 drivers/staging/ramster/TODO delete mode 100755 drivers/staging/ramster/cluster/Makefile delete mode 100755 drivers/staging/ramster/cluster/heartbeat.c delete mode 100755 drivers/staging/ramster/cluster/heartbeat.h delete mode 100755 drivers/staging/ramster/cluster/masklog.c delete mode 100755 drivers/staging/ramster/cluster/masklog.h delete mode 100755 drivers/staging/ramster/cluster/nodemanager.c delete mode 100755 drivers/staging/ramster/cluster/nodemanager.h delete mode 100755 drivers/staging/ramster/cluster/ramster_nodemanager.h delete mode 100755 drivers/staging/ramster/cluster/tcp.c delete mode 100755 drivers/staging/ramster/cluster/tcp.h delete mode 100755 drivers/staging/ramster/cluster/tcp_internal.h delete mode 100644 drivers/staging/ramster/r2net.c delete mode 100644 drivers/staging/ramster/ramster.h delete mode 100644 drivers/staging/ramster/tmem.c delete mode 100644 drivers/staging/ramster/tmem.h delete mode 100644 drivers/staging/ramster/xvmalloc.c delete mode 100644 drivers/staging/ramster/xvmalloc.h delete mode 100644 drivers/staging/ramster/xvmalloc_int.h delete mode 100644 drivers/staging/ramster/zcache-main.c delete mode 100644 drivers/staging/ramster/zcache.h delete mode 100644 drivers/staging/zcache/Kconfig delete mode 100644 drivers/staging/zcache/Makefile delete mode 100644 drivers/staging/zcache/tmem.c delete mode 100644 drivers/staging/zcache/tmem.h delete mode 100644 drivers/staging/zcache/zcache-main.c diff --git a/drivers/staging/ramster/Kconfig b/drivers/staging/ramster/Kconfig deleted file mode 100644 index 4af1f8d4b..000000000 --- a/drivers/staging/ramster/Kconfig +++ /dev/null @@ -1,13 +0,0 @@ -config RAMSTER - bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem" - depends on (CLEANCACHE || FRONTSWAP) && 
CONFIGFS_FS=y && !ZCACHE && !XVMALLOC && !HIGHMEM - select LZO_COMPRESS - select LZO_DECOMPRESS - default n - help - RAMster allows RAM on other machines in a cluster to be utilized - dynamically and symmetrically instead of swapping to a local swap - disk, thus improving performance on memory-constrained workloads - while minimizing total RAM across the cluster. RAMster, like - zcache, compresses swap pages into local RAM, but then remotifies - the compressed pages to another node in the RAMster cluster. diff --git a/drivers/staging/ramster/Makefile b/drivers/staging/ramster/Makefile deleted file mode 100644 index bcc13c87f..000000000 --- a/drivers/staging/ramster/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_RAMSTER) += zcache-main.o tmem.o r2net.o xvmalloc.o cluster/ diff --git a/drivers/staging/ramster/TODO b/drivers/staging/ramster/TODO deleted file mode 100644 index 46fcf0c58..000000000 --- a/drivers/staging/ramster/TODO +++ /dev/null @@ -1,13 +0,0 @@ -For this staging driver, RAMster duplicates code from drivers/staging/zcache -then incorporates changes to the local copy of the code. For V5, it also -directly incorporates the soon-to-be-removed drivers/staging/zram/xvmalloc.[ch] -as all testing has been done with xvmalloc rather than the new zsmalloc. -Before RAMster can be promoted from staging, the zcache and RAMster drivers -should be either merged or reorganized to separate out common code. - -Until V4, RAMster duplicated code from fs/ocfs2/cluster, but this made -RAMster incompatible with ocfs2 running in the same kernel and included -lots of code that could be removed. As of V5, the ocfs2 code has been -mined and made RAMster-specific, made to communicate with a userland -ramster-tools package rather than ocfs2-tools, and can co-exist with ocfs2 -both in the same kernel and in userland on the same machine. diff --git a/drivers/staging/ramster/cluster/Makefile b/drivers/staging/ramster/cluster/Makefile deleted file mode 100755 index 9c6943652..000000000 --- a/drivers/staging/ramster/cluster/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_RAMSTER) += ramster_nodemanager.o - -ramster_nodemanager-objs := heartbeat.o masklog.o nodemanager.o tcp.o diff --git a/drivers/staging/ramster/cluster/heartbeat.c b/drivers/staging/ramster/cluster/heartbeat.c deleted file mode 100755 index 002094907..000000000 --- a/drivers/staging/ramster/cluster/heartbeat.c +++ /dev/null @@ -1,464 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include -#include -#include - -#include "heartbeat.h" -#include "tcp.h" -#include "nodemanager.h" - -#include "masklog.h" - -/* - * The first heartbeat pass had one global thread that would serialize all hb - * callback calls. This global serializing sem should only be removed once - * we've made sure that all callees can deal with being called concurrently - * from multiple hb region threads. - */ -static DECLARE_RWSEM(r2hb_callback_sem); - -/* - * multiple hb threads are watching multiple regions. A node is live - * whenever any of the threads sees activity from the node in its region. - */ -static DEFINE_SPINLOCK(r2hb_live_lock); -static unsigned long r2hb_live_node_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)]; - -static struct r2hb_callback { - struct list_head list; -} r2hb_callbacks[R2HB_NUM_CB]; - -enum r2hb_heartbeat_modes { - R2HB_HEARTBEAT_LOCAL = 0, - R2HB_HEARTBEAT_GLOBAL, - R2HB_HEARTBEAT_NUM_MODES, -}; - -char *r2hb_heartbeat_mode_desc[R2HB_HEARTBEAT_NUM_MODES] = { - "local", /* R2HB_HEARTBEAT_LOCAL */ - "global", /* R2HB_HEARTBEAT_GLOBAL */ -}; - -unsigned int r2hb_dead_threshold = R2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int r2hb_heartbeat_mode = R2HB_HEARTBEAT_LOCAL; - -/* Only sets a new threshold if there are no active regions. - * - * No locking or otherwise interesting code is required for reading - * r2hb_dead_threshold as it can't change once regions are active and - * it's not interesting to anyone until then anyway. */ -static void r2hb_dead_threshold_set(unsigned int threshold) -{ - if (threshold > R2HB_MIN_DEAD_THRESHOLD) { - spin_lock(&r2hb_live_lock); - r2hb_dead_threshold = threshold; - spin_unlock(&r2hb_live_lock); - } -} - -static int r2hb_global_hearbeat_mode_set(unsigned int hb_mode) -{ - int ret = -1; - - if (hb_mode < R2HB_HEARTBEAT_NUM_MODES) { - spin_lock(&r2hb_live_lock); - r2hb_heartbeat_mode = hb_mode; - ret = 0; - spin_unlock(&r2hb_live_lock); - } - - return ret; -} - -void r2hb_exit(void) -{ -} - -int r2hb_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(r2hb_callbacks); i++) - INIT_LIST_HEAD(&r2hb_callbacks[i].list); - - memset(r2hb_live_node_bitmap, 0, sizeof(r2hb_live_node_bitmap)); - - return 0; -} - -/* if we're already in a callback then we're already serialized by the sem */ -static void r2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) -{ - BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &r2hb_live_node_bitmap, bytes); -} - -/* - * get a map of all nodes that are heartbeating in any regions - */ -void r2hb_fill_node_map(unsigned long *map, unsigned bytes) -{ - /* callers want to serialize this map and callbacks so that they - * can trust that they don't miss nodes coming to the party */ - down_read(&r2hb_callback_sem); - spin_lock(&r2hb_live_lock); - r2hb_fill_node_map_from_callback(map, bytes); - spin_unlock(&r2hb_live_lock); - up_read(&r2hb_callback_sem); -} -EXPORT_SYMBOL_GPL(r2hb_fill_node_map); - -/* - * heartbeat configfs bits. The heartbeat set is a default set under - * the cluster set in nodemanager.c. - */ - -/* heartbeat set */ - -struct r2hb_hb_group { - struct config_group hs_group; - /* some stuff? */ -}; - -static struct r2hb_hb_group *to_r2hb_hb_group(struct config_group *group) -{ - return group ? 
- container_of(group, struct r2hb_hb_group, hs_group) - : NULL; -} - -static struct config_item r2hb_config_item; - -static struct config_item *r2hb_hb_group_make_item(struct config_group *group, - const char *name) -{ - int ret; - - if (strlen(name) > R2HB_MAX_REGION_NAME_LEN) { - ret = -ENAMETOOLONG; - goto free; - } - - config_item_put(&r2hb_config_item); - - return &r2hb_config_item; -free: - return ERR_PTR(ret); -} - -static void r2hb_hb_group_drop_item(struct config_group *group, - struct config_item *item) -{ - if (r2hb_global_heartbeat_active()) { - printk(KERN_NOTICE "ramster: Heartbeat %s " - "on region %s (%s)\n", - "stopped/aborted", config_item_name(item), - "no region"); - } - - config_item_put(item); -} - -struct r2hb_hb_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2hb_hb_group *, char *); - ssize_t (*store)(struct r2hb_hb_group *, const char *, size_t); -}; - -static ssize_t r2hb_hb_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item)); - struct r2hb_hb_group_attribute *r2hb_hb_group_attr = - container_of(attr, struct r2hb_hb_group_attribute, attr); - ssize_t ret = 0; - - if (r2hb_hb_group_attr->show) - ret = r2hb_hb_group_attr->show(reg, page); - return ret; -} - -static ssize_t r2hb_hb_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2hb_hb_group *reg = to_r2hb_hb_group(to_config_group(item)); - struct r2hb_hb_group_attribute *r2hb_hb_group_attr = - container_of(attr, struct r2hb_hb_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (r2hb_hb_group_attr->store) - ret = r2hb_hb_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t r2hb_hb_group_threshold_show(struct r2hb_hb_group *group, - char *page) -{ - return sprintf(page, "%u\n", r2hb_dead_threshold); -} - -static ssize_t r2hb_hb_group_threshold_store(struct r2hb_hb_group *group, - const char *page, - size_t count) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - /* this will validate ranges for us. */ - r2hb_dead_threshold_set((unsigned int) tmp); - - return count; -} - -static -ssize_t r2hb_hb_group_mode_show(struct r2hb_hb_group *group, - char *page) -{ - return sprintf(page, "%s\n", - r2hb_heartbeat_mode_desc[r2hb_heartbeat_mode]); -} - -static -ssize_t r2hb_hb_group_mode_store(struct r2hb_hb_group *group, - const char *page, size_t count) -{ - unsigned int i; - int ret; - size_t len; - - len = (page[count - 1] == '\n') ? 
count - 1 : count; - if (!len) - return -EINVAL; - - for (i = 0; i < R2HB_HEARTBEAT_NUM_MODES; ++i) { - if (strnicmp(page, r2hb_heartbeat_mode_desc[i], len)) - continue; - - ret = r2hb_global_hearbeat_mode_set(i); - if (!ret) - printk(KERN_NOTICE "ramster: Heartbeat mode " - "set to %s\n", - r2hb_heartbeat_mode_desc[i]); - return count; - } - - return -EINVAL; - -} - -static struct r2hb_hb_group_attribute r2hb_hb_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2hb_hb_group_threshold_show, - .store = r2hb_hb_group_threshold_store, -}; - -static struct r2hb_hb_group_attribute r2hb_hb_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2hb_hb_group_mode_show, - .store = r2hb_hb_group_mode_store, -}; - -static struct configfs_attribute *r2hb_hb_group_attrs[] = { - &r2hb_hb_group_attr_threshold.attr, - &r2hb_hb_group_attr_mode.attr, - NULL, -}; - -static struct configfs_item_operations r2hb_hearbeat_group_item_ops = { - .show_attribute = r2hb_hb_group_show, - .store_attribute = r2hb_hb_group_store, -}; - -static struct configfs_group_operations r2hb_hb_group_group_ops = { - .make_item = r2hb_hb_group_make_item, - .drop_item = r2hb_hb_group_drop_item, -}; - -static struct config_item_type r2hb_hb_group_type = { - .ct_group_ops = &r2hb_hb_group_group_ops, - .ct_item_ops = &r2hb_hearbeat_group_item_ops, - .ct_attrs = r2hb_hb_group_attrs, - .ct_owner = THIS_MODULE, -}; - -/* this is just here to avoid touching group in heartbeat.h which the - * entire damn world #includes */ -struct config_group *r2hb_alloc_hb_set(void) -{ - struct r2hb_hb_group *hs = NULL; - struct config_group *ret = NULL; - - hs = kzalloc(sizeof(struct r2hb_hb_group), GFP_KERNEL); - if (hs == NULL) - goto out; - - config_group_init_type_name(&hs->hs_group, "heartbeat", - &r2hb_hb_group_type); - - ret = &hs->hs_group; -out: - if (ret == NULL) - kfree(hs); - return ret; -} - -void r2hb_free_hb_set(struct config_group *group) -{ - struct r2hb_hb_group *hs = to_r2hb_hb_group(group); - kfree(hs); -} - -/* hb callback registration and issuing */ - -static struct r2hb_callback *hbcall_from_type(enum r2hb_callback_type type) -{ - if (type == R2HB_NUM_CB) - return ERR_PTR(-EINVAL); - - return &r2hb_callbacks[type]; -} - -void r2hb_setup_callback(struct r2hb_callback_func *hc, - enum r2hb_callback_type type, - r2hb_cb_func *func, - void *data, - int priority) -{ - INIT_LIST_HEAD(&hc->hc_item); - hc->hc_func = func; - hc->hc_data = data; - hc->hc_priority = priority; - hc->hc_type = type; - hc->hc_magic = R2HB_CB_MAGIC; -} -EXPORT_SYMBOL_GPL(r2hb_setup_callback); - -int r2hb_register_callback(const char *region_uuid, - struct r2hb_callback_func *hc) -{ - struct r2hb_callback_func *tmp; - struct list_head *iter; - struct r2hb_callback *hbcall; - int ret; - - BUG_ON(hc->hc_magic != R2HB_CB_MAGIC); - BUG_ON(!list_empty(&hc->hc_item)); - - hbcall = hbcall_from_type(hc->hc_type); - if (IS_ERR(hbcall)) { - ret = PTR_ERR(hbcall); - goto out; - } - - down_write(&r2hb_callback_sem); - - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct r2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); - break; - } - } - if (list_empty(&hc->hc_item)) - list_add_tail(&hc->hc_item, &hbcall->list); - - up_write(&r2hb_callback_sem); - ret = 0; -out: - mlog(ML_CLUSTER, "returning %d on behalf of %p for funcs %p\n", - ret, 
__builtin_return_address(0), hc); - return ret; -} -EXPORT_SYMBOL_GPL(r2hb_register_callback); - -void r2hb_unregister_callback(const char *region_uuid, - struct r2hb_callback_func *hc) -{ - BUG_ON(hc->hc_magic != R2HB_CB_MAGIC); - - mlog(ML_CLUSTER, "on behalf of %p for funcs %p\n", - __builtin_return_address(0), hc); - - /* XXX Can this happen _with_ a region reference? */ - if (list_empty(&hc->hc_item)) - return; - - down_write(&r2hb_callback_sem); - - list_del_init(&hc->hc_item); - - up_write(&r2hb_callback_sem); -} -EXPORT_SYMBOL_GPL(r2hb_unregister_callback); - -int r2hb_check_node_heartbeating_from_callback(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(R2NM_MAX_NODES)]; - - r2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(r2hb_check_node_heartbeating_from_callback); - -void r2hb_stop_all_regions(void) -{ -} -EXPORT_SYMBOL_GPL(r2hb_stop_all_regions); - -/* - * this is just a hack until we get the plumbing which flips file systems - * read only and drops the hb ref instead of killing the node dead. - */ -int r2hb_global_heartbeat_active(void) -{ - return (r2hb_heartbeat_mode == R2HB_HEARTBEAT_GLOBAL); -} -EXPORT_SYMBOL(r2hb_global_heartbeat_active); - -/* added for RAMster */ -void r2hb_manual_set_node_heartbeating(int node_num) -{ - if (node_num < R2NM_MAX_NODES) - set_bit(node_num, r2hb_live_node_bitmap); -} -EXPORT_SYMBOL(r2hb_manual_set_node_heartbeating); diff --git a/drivers/staging/ramster/cluster/heartbeat.h b/drivers/staging/ramster/cluster/heartbeat.h deleted file mode 100755 index 6cbc775bd..000000000 --- a/drivers/staging/ramster/cluster/heartbeat.h +++ /dev/null @@ -1,87 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * heartbeat.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_HEARTBEAT_H -#define R2CLUSTER_HEARTBEAT_H - -#define R2HB_REGION_TIMEOUT_MS 2000 - -#define R2HB_MAX_REGION_NAME_LEN 32 - -/* number of changes to be seen as live */ -#define R2HB_LIVE_THRESHOLD 2 -/* number of equal samples to be seen as dead */ -extern unsigned int r2hb_dead_threshold; -#define R2HB_DEFAULT_DEAD_THRESHOLD 31 -/* Otherwise MAX_WRITE_TIMEOUT will be zero... 
*/ -#define R2HB_MIN_DEAD_THRESHOLD 2 -#define R2HB_MAX_WRITE_TIMEOUT_MS \ - (R2HB_REGION_TIMEOUT_MS * (r2hb_dead_threshold - 1)) - -#define R2HB_CB_MAGIC 0x51d1e4ec - -/* callback stuff */ -enum r2hb_callback_type { - R2HB_NODE_DOWN_CB = 0, - R2HB_NODE_UP_CB, - R2HB_NUM_CB -}; - -struct r2nm_node; -typedef void (r2hb_cb_func)(struct r2nm_node *, int, void *); - -struct r2hb_callback_func { - u32 hc_magic; - struct list_head hc_item; - r2hb_cb_func *hc_func; - void *hc_data; - int hc_priority; - enum r2hb_callback_type hc_type; -}; - -struct config_group *r2hb_alloc_hb_set(void); -void r2hb_free_hb_set(struct config_group *group); - -void r2hb_setup_callback(struct r2hb_callback_func *hc, - enum r2hb_callback_type type, - r2hb_cb_func *func, - void *data, - int priority); -int r2hb_register_callback(const char *region_uuid, - struct r2hb_callback_func *hc); -void r2hb_unregister_callback(const char *region_uuid, - struct r2hb_callback_func *hc); -void r2hb_fill_node_map(unsigned long *map, - unsigned bytes); -void r2hb_exit(void); -int r2hb_init(void); -int r2hb_check_node_heartbeating_from_callback(u8 node_num); -void r2hb_stop_all_regions(void); -int r2hb_get_all_regions(char *region_uuids, u8 numregions); -int r2hb_global_heartbeat_active(void); -void r2hb_manual_set_node_heartbeating(int); - -#endif /* R2CLUSTER_HEARTBEAT_H */ diff --git a/drivers/staging/ramster/cluster/masklog.c b/drivers/staging/ramster/cluster/masklog.c deleted file mode 100755 index 1261d8579..000000000 --- a/drivers/staging/ramster/cluster/masklog.c +++ /dev/null @@ -1,155 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "masklog.h" - -struct mlog_bits r2_mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); -EXPORT_SYMBOL_GPL(r2_mlog_and_bits); -struct mlog_bits r2_mlog_not_bits = MLOG_BITS_RHS(0); -EXPORT_SYMBOL_GPL(r2_mlog_not_bits); - -static ssize_t mlog_mask_show(u64 mask, char *buf) -{ - char *state; - - if (__mlog_test_u64(mask, r2_mlog_and_bits)) - state = "allow"; - else if (__mlog_test_u64(mask, r2_mlog_not_bits)) - state = "deny"; - else - state = "off"; - - return snprintf(buf, PAGE_SIZE, "%s\n", state); -} - -static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) -{ - if (!strnicmp(buf, "allow", 5)) { - __mlog_set_u64(mask, r2_mlog_and_bits); - __mlog_clear_u64(mask, r2_mlog_not_bits); - } else if (!strnicmp(buf, "deny", 4)) { - __mlog_set_u64(mask, r2_mlog_not_bits); - __mlog_clear_u64(mask, r2_mlog_and_bits); - } else if (!strnicmp(buf, "off", 3)) { - __mlog_clear_u64(mask, r2_mlog_not_bits); - __mlog_clear_u64(mask, r2_mlog_and_bits); - } else - return -EINVAL; - - return count; -} - -struct mlog_attribute { - struct attribute attr; - u64 mask; -}; - -#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) - -#define define_mask(_name) { \ - .attr = { \ - .name = #_name, \ - .mode = S_IRUGO | S_IWUSR, \ - }, \ - .mask = ML_##_name, \ -} - -static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { - define_mask(TCP), - define_mask(MSG), - define_mask(SOCKET), - define_mask(HEARTBEAT), - define_mask(HB_BIO), - define_mask(DLMFS), - define_mask(DLM), - define_mask(DLM_DOMAIN), - define_mask(DLM_THREAD), - define_mask(DLM_MASTER), - define_mask(DLM_RECOVERY), - define_mask(DLM_GLUE), - define_mask(VOTE), - define_mask(CONN), - define_mask(QUORUM), - define_mask(BASTS), - define_mask(CLUSTER), - define_mask(ERROR), - define_mask(NOTICE), - define_mask(KTHREAD), -}; - -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; - -static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, - char *buf) -{ - struct mlog_attribute *mlog_attr = to_mlog_attr(attr); - - return mlog_mask_show(mlog_attr->mask, buf); -} - -static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, - const char *buf, size_t count) -{ - struct mlog_attribute *mlog_attr = to_mlog_attr(attr); - - return mlog_mask_store(mlog_attr->mask, buf, count); -} - -static const struct sysfs_ops mlog_attr_ops = { - .show = mlog_show, - .store = mlog_store, -}; - -static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, -}; - -static struct kset mlog_kset = { - .kobj = {.ktype = &mlog_ktype}, -}; - -int r2_mlog_sys_init(struct kset *r2cb_kset) -{ - int i = 0; - - while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; - i++; - } - mlog_attr_ptrs[i] = NULL; - - kobject_set_name(&mlog_kset.kobj, "logmask"); - mlog_kset.kobj.kset = r2cb_kset; - return kset_register(&mlog_kset); -} - -void r2_mlog_sys_shutdown(void) -{ - kset_unregister(&mlog_kset); -} diff --git a/drivers/staging/ramster/cluster/masklog.h b/drivers/staging/ramster/cluster/masklog.h deleted file mode 100755 index 918ae110b..000000000 --- a/drivers/staging/ramster/cluster/masklog.h +++ /dev/null @@ -1,220 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2005, 2012 Oracle. All rights reserved. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef R2CLUSTER_MASKLOG_H -#define R2CLUSTER_MASKLOG_H - -/* - * For now this is a trivial wrapper around printk() that gives the critical - * ability to enable sets of debugging output at run-time. In the future this - * will almost certainly be redirected to relayfs so that it can pay a - * substantially lower heisenberg tax. - * - * Callers associate the message with a bitmask and a global bitmask is - * maintained with help from /proc. If any of the bits match the message is - * output. - * - * We must have efficient bit tests on i386 and it seems gcc still emits crazy - * code for the 64bit compare. It emits very good code for the dual unsigned - * long tests, though, completely avoiding tests that can never pass if the - * caller gives a constant bitmask that fills one of the longs with all 0s. So - * the desire is to have almost all of the calls decided on by comparing just - * one of the longs. This leads to having infrequently given bits that are - * frequently matched in the high bits. - * - * _ERROR and _NOTICE are used for messages that always go to the console and - * have appropriate KERN_ prefixes. We wrap these in our function instead of - * just calling printk() so that this can eventually make its way through - * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. - * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant - * mask, as is almost always the case. - * - * All this bitmask nonsense is managed from the files under - * /sys/fs/r2cb/logmask/. Reading the files gives a straightforward - * indication of which bits are allowed (allow) or denied (off/deny). - * ENTRY deny - * EXIT deny - * TCP off - * MSG off - * SOCKET off - * ERROR allow - * NOTICE allow - * - * Writing changes the state of a given bit and requires a strictly formatted - * single write() call: - * - * write(fd, "allow", 5); - * - * Echoing allow/deny/off string into the logmask files can flip the bits - * on or off as expected; here is the bash script for example: - * - * log_mask="/sys/fs/r2cb/log_mask" - * for node in ENTRY EXIT TCP MSG SOCKET ERROR NOTICE; do - * echo allow >"$log_mask"/"$node" - * done - * - * The debugfs.ramster tool can also flip the bits with the -l option: - * - * debugfs.ramster -l TCP allow - */ - -/* for task_struct */ -#include - -/* bits that are frequently given and infrequently matched in the low word */ -/* NOTE: If you add a flag, you need to also update masklog.c! 
*/ -#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */ -#define ML_MSG 0x0000000000000002ULL /* net network messages */ -#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */ -#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */ -#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */ -#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */ -#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */ -#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */ -#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */ -#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */ -#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */ -#define ML_DLM_GLUE 0x0000000000000800ULL /* ramster dlm glue layer */ -#define ML_VOTE 0x0000000000001000ULL /* ramster node messaging */ -#define ML_CONN 0x0000000000002000ULL /* net connection management */ -#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */ -#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */ -#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */ - -/* bits that are infrequently given and frequently matched in the high word */ -#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ -#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ - -#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) -#ifndef MLOG_MASK_PREFIX -#define MLOG_MASK_PREFIX 0 -#endif - -/* - * When logging is disabled, force the bit test to 0 for anything other - * than errors and notices, allowing gcc to remove the code completely. - * When enabled, allow all masks. - */ -#if defined(CONFIG_RAMSTER_DEBUG_MASKLOG) -#define ML_ALLOWED_BITS (~0) -#else -#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) -#endif - -#define MLOG_MAX_BITS 64 - -struct mlog_bits { - unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG]; -}; - -extern struct mlog_bits r2_mlog_and_bits, r2_mlog_not_bits; - -#if BITS_PER_LONG == 32 - -#define __mlog_test_u64(mask, bits) \ - ((u32)(mask & 0xffffffff) & bits.words[0] || \ - ((u64)(mask) >> 32) & bits.words[1]) -#define __mlog_set_u64(mask, bits) do { \ - bits.words[0] |= (u32)(mask & 0xffffffff); \ - bits.words[1] |= (u64)(mask) >> 32; \ -} while (0) -#define __mlog_clear_u64(mask, bits) do { \ - bits.words[0] &= ~((u32)(mask & 0xffffffff)); \ - bits.words[1] &= ~((u64)(mask) >> 32); \ -} while (0) -#define MLOG_BITS_RHS(mask) { \ - { \ - [0] = (u32)(mask & 0xffffffff), \ - [1] = (u64)(mask) >> 32, \ - } \ -} - -#else /* 32bit long above, 64bit long below */ - -#define __mlog_test_u64(mask, bits) ((mask) & bits.words[0]) -#define __mlog_set_u64(mask, bits) do { \ - bits.words[0] |= (mask); \ -} while (0) -#define __mlog_clear_u64(mask, bits) do { \ - bits.words[0] &= ~(mask); \ -} while (0) -#define MLOG_BITS_RHS(mask) { { (mask) } } - -#endif - -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) - -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define __mlog_printk(level, fmt, args...) 
\ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, r2_mlog_and_bits) && \ - !__mlog_test_u64(__m, r2_mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else \ - __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ -} while (0) - -#define mlog_errno(st) do { \ - int _st = (st); \ - if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ - mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) - -#define mlog_bug_on_msg(cond, fmt, args...) do { \ - if (cond) { \ - mlog(ML_ERROR, "bug expression: " #cond "\n"); \ - mlog(ML_ERROR, fmt, ##args); \ - BUG(); \ - } \ -} while (0) - -#include -#include -int r2_mlog_sys_init(struct kset *r2cb_subsys); -void r2_mlog_sys_shutdown(void); - -#endif /* R2CLUSTER_MASKLOG_H */ diff --git a/drivers/staging/ramster/cluster/nodemanager.c b/drivers/staging/ramster/cluster/nodemanager.c deleted file mode 100755 index de0e5c8da..000000000 --- a/drivers/staging/ramster/cluster/nodemanager.c +++ /dev/null @@ -1,992 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004, 2005, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include -#include -#include -#include - -#include "tcp.h" -#include "nodemanager.h" -#include "heartbeat.h" -#include "masklog.h" - -/* for now we operate under the assertion that there can be only one - * cluster active at a time. 
Changing this will require trickling - * cluster references throughout where nodes are looked up */ -struct r2nm_cluster *r2nm_single_cluster; - -char *r2nm_fence_method_desc[R2NM_FENCE_METHODS] = { - "reset", /* R2NM_FENCE_RESET */ - "panic", /* R2NM_FENCE_PANIC */ -}; - -struct r2nm_node *r2nm_get_node_by_num(u8 node_num) -{ - struct r2nm_node *node = NULL; - - if (node_num >= R2NM_MAX_NODES || r2nm_single_cluster == NULL) - goto out; - - read_lock(&r2nm_single_cluster->cl_nodes_lock); - node = r2nm_single_cluster->cl_nodes[node_num]; - if (node) - config_item_get(&node->nd_item); - read_unlock(&r2nm_single_cluster->cl_nodes_lock); -out: - return node; -} -EXPORT_SYMBOL_GPL(r2nm_get_node_by_num); - -int r2nm_configured_node_map(unsigned long *map, unsigned bytes) -{ - struct r2nm_cluster *cluster = r2nm_single_cluster; - - BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); - - if (cluster == NULL) - return -EINVAL; - - read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); - read_unlock(&cluster->cl_nodes_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(r2nm_configured_node_map); - -static struct r2nm_node *r2nm_node_ip_tree_lookup(struct r2nm_cluster *cluster, - __be32 ip_needle, - struct rb_node ***ret_p, - struct rb_node **ret_parent) -{ - struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; - struct rb_node *parent = NULL; - struct r2nm_node *node, *ret = NULL; - - while (*p) { - int cmp; - - parent = *p; - node = rb_entry(parent, struct r2nm_node, nd_ip_node); - - cmp = memcmp(&ip_needle, &node->nd_ipv4_address, - sizeof(ip_needle)); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { - ret = node; - break; - } - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - - return ret; -} - -struct r2nm_node *r2nm_get_node_by_ip(__be32 addr) -{ - struct r2nm_node *node = NULL; - struct r2nm_cluster *cluster = r2nm_single_cluster; - - if (cluster == NULL) - goto out; - - read_lock(&cluster->cl_nodes_lock); - node = r2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); - if (node) - config_item_get(&node->nd_item); - read_unlock(&cluster->cl_nodes_lock); - -out: - return node; -} -EXPORT_SYMBOL_GPL(r2nm_get_node_by_ip); - -void r2nm_node_put(struct r2nm_node *node) -{ - config_item_put(&node->nd_item); -} -EXPORT_SYMBOL_GPL(r2nm_node_put); - -void r2nm_node_get(struct r2nm_node *node) -{ - config_item_get(&node->nd_item); -} -EXPORT_SYMBOL_GPL(r2nm_node_get); - -u8 r2nm_this_node(void) -{ - u8 node_num = R2NM_MAX_NODES; - - if (r2nm_single_cluster && r2nm_single_cluster->cl_has_local) - node_num = r2nm_single_cluster->cl_local_node; - - return node_num; -} -EXPORT_SYMBOL_GPL(r2nm_this_node); - -/* node configfs bits */ - -static struct r2nm_cluster *to_r2nm_cluster(struct config_item *item) -{ - return item ? - container_of(to_config_group(item), struct r2nm_cluster, - cl_group) - : NULL; -} - -static struct r2nm_node *to_r2nm_node(struct config_item *item) -{ - return item ? 
container_of(item, struct r2nm_node, nd_item) : NULL; -} - -static void r2nm_node_release(struct config_item *item) -{ - struct r2nm_node *node = to_r2nm_node(item); - kfree(node); -} - -static ssize_t r2nm_node_num_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%d\n", node->nd_num); -} - -static struct r2nm_cluster *to_r2nm_cluster_from_node(struct r2nm_node *node) -{ - /* through the first node_set .parent - * mycluster/nodes/mynode == r2nm_cluster->r2nm_node_group->r2nm_node */ - return to_r2nm_cluster(node->nd_item.ci_parent->ci_parent); -} - -enum { - R2NM_NODE_ATTR_NUM = 0, - R2NM_NODE_ATTR_PORT, - R2NM_NODE_ATTR_ADDRESS, - R2NM_NODE_ATTR_LOCAL, -}; - -static ssize_t r2nm_node_num_write(struct r2nm_node *node, const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node); - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp >= R2NM_MAX_NODES) - return -ERANGE; - - /* once we're in the cl_nodes tree networking can look us up by - * node number and try to use our address and port attributes - * to connect to this node.. make sure that they've been set - * before writing the node attribute? */ - if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) - return -EINVAL; /* XXX */ - - write_lock(&cluster->cl_nodes_lock); - if (cluster->cl_nodes[tmp]) - p = NULL; - else { - cluster->cl_nodes[tmp] = node; - node->nd_num = tmp; - set_bit(tmp, cluster->cl_nodes_bitmap); - } - write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; - - return count; -} -static ssize_t r2nm_node_ipv4_port_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); -} - -static ssize_t r2nm_node_ipv4_port_write(struct r2nm_node *node, - const char *page, size_t count) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp == 0) - return -EINVAL; - if (tmp >= (u16)-1) - return -ERANGE; - - node->nd_ipv4_port = htons(tmp); - - return count; -} - -static ssize_t r2nm_node_ipv4_address_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); -} - -static ssize_t r2nm_node_ipv4_address_write(struct r2nm_node *node, - const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster_from_node(node); - int ret, i; - struct rb_node **p, *parent; - unsigned int octets[4]; - __be32 ipv4_addr = 0; - - ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], - &octets[1], &octets[0]); - if (ret != 4) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(octets); i++) { - if (octets[i] > 255) - return -ERANGE; - be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); - } - - ret = 0; - write_lock(&cluster->cl_nodes_lock); - if (r2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) - ret = -EEXIST; - else { - rb_link_node(&node->nd_ip_node, parent, p); - rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); - } - write_unlock(&cluster->cl_nodes_lock); - if (ret) - return ret; - - memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); - - return count; -} - -static ssize_t r2nm_node_local_read(struct r2nm_node *node, char *page) -{ - return sprintf(page, "%d\n", node->nd_local); -} - -static ssize_t r2nm_node_local_write(struct r2nm_node *node, const char *page, - size_t count) -{ - struct r2nm_cluster *cluster = 
to_r2nm_cluster_from_node(node); - unsigned long tmp; - char *p = (char *)page; - ssize_t ret; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - tmp = !!tmp; /* boolean of whether this node wants to be local */ - - /* setting local turns on networking rx for now so we require having - * set everything else first */ - if (!test_bit(R2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || - !test_bit(R2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) - return -EINVAL; /* XXX */ - - /* the only failure case is trying to set a new local node - * when a different one is already set */ - if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; - - /* bring up the rx thread if we're setting the new local node. */ - if (tmp && !cluster->cl_has_local) { - ret = r2net_start_listening(node); - if (ret) - return ret; - } - - if (!tmp && cluster->cl_has_local && - cluster->cl_local_node == node->nd_num) { - r2net_stop_listening(node); - cluster->cl_local_node = R2NM_INVALID_NODE_NUM; - } - - node->nd_local = tmp; - if (node->nd_local) { - cluster->cl_has_local = tmp; - cluster->cl_local_node = node->nd_num; - } - - return count; -} - -struct r2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2nm_node *, char *); - ssize_t (*store)(struct r2nm_node *, const char *, size_t); -}; - -static struct r2nm_node_attribute r2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_num_read, - .store = r2nm_node_num_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_ipv4_port_read, - .store = r2nm_node_ipv4_port_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_ipv4_address_read, - .store = r2nm_node_ipv4_address_write, -}; - -static struct r2nm_node_attribute r2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_node_local_read, - .store = r2nm_node_local_write, -}; - -static struct configfs_attribute *r2nm_node_attrs[] = { - [R2NM_NODE_ATTR_NUM] = &r2nm_node_attr_num.attr, - [R2NM_NODE_ATTR_PORT] = &r2nm_node_attr_ipv4_port.attr, - [R2NM_NODE_ATTR_ADDRESS] = &r2nm_node_attr_ipv4_address.attr, - [R2NM_NODE_ATTR_LOCAL] = &r2nm_node_attr_local.attr, - NULL, -}; - -static int r2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(r2nm_node_attrs); i++) { - if (attr == r2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t r2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_node_attribute *r2nm_node_attr = - container_of(attr, struct r2nm_node_attribute, attr); - ssize_t ret = 0; - - if (r2nm_node_attr->show) - ret = r2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t r2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_node_attribute *r2nm_node_attr = - container_of(attr, struct r2nm_node_attribute, attr); - ssize_t ret; - int 
attr_index = r2nm_attr_index(attr); - - if (r2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = r2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - -static struct configfs_item_operations r2nm_node_item_ops = { - .release = r2nm_node_release, - .show_attribute = r2nm_node_show, - .store_attribute = r2nm_node_store, -}; - -static struct config_item_type r2nm_node_type = { - .ct_item_ops = &r2nm_node_item_ops, - .ct_attrs = r2nm_node_attrs, - .ct_owner = THIS_MODULE, -}; - -/* node set */ - -struct r2nm_node_group { - struct config_group ns_group; - /* some stuff? */ -}; - -#if 0 -static struct r2nm_node_group *to_r2nm_node_group(struct config_group *group) -{ - return group ? - container_of(group, struct r2nm_node_group, ns_group) - : NULL; -} -#endif - -struct r2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct r2nm_cluster *, char *); - ssize_t (*store)(struct r2nm_cluster *, const char *, size_t); -}; - -static ssize_t r2nm_cluster_attr_write(const char *page, ssize_t count, - unsigned int *val) -{ - unsigned long tmp; - char *p = (char *)page; - int err; - - err = kstrtoul(p, 10, &tmp); - if (err) - return err; - - if (tmp == 0) - return -EINVAL; - if (tmp >= (u32)-1) - return -ERANGE; - - *val = tmp; - - return count; -} - -static ssize_t r2nm_cluster_attr_idle_timeout_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); -} - -static ssize_t r2nm_cluster_attr_idle_timeout_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - ssize_t ret; - unsigned int val = 0; - - ret = r2nm_cluster_attr_write(page, count, &val); - - if (ret > 0) { - if (cluster->cl_idle_timeout_ms != val - && r2net_num_connected_peers()) { - mlog(ML_NOTICE, - "r2net: cannot change idle timeout after " - "the first peer has agreed to it." - " %d connected peers\n", - r2net_num_connected_peers()); - ret = -EINVAL; - } else if (val <= cluster->cl_keepalive_delay_ms) { - mlog(ML_NOTICE, "r2net: idle timeout must be larger " - "than keepalive delay\n"); - ret = -EINVAL; - } else { - cluster->cl_idle_timeout_ms = val; - } - } - - return ret; -} - -static ssize_t r2nm_cluster_attr_keepalive_delay_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); -} - -static ssize_t r2nm_cluster_attr_keepalive_delay_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - ssize_t ret; - unsigned int val = 0; - - ret = r2nm_cluster_attr_write(page, count, &val); - - if (ret > 0) { - if (cluster->cl_keepalive_delay_ms != val - && r2net_num_connected_peers()) { - mlog(ML_NOTICE, - "r2net: cannot change keepalive delay after" - " the first peer has agreed to it." 
- " %d connected peers\n", - r2net_num_connected_peers()); - ret = -EINVAL; - } else if (val >= cluster->cl_idle_timeout_ms) { - mlog(ML_NOTICE, "r2net: keepalive delay must be " - "smaller than idle timeout\n"); - ret = -EINVAL; - } else { - cluster->cl_keepalive_delay_ms = val; - } - } - - return ret; -} - -static ssize_t r2nm_cluster_attr_reconnect_delay_ms_read( - struct r2nm_cluster *cluster, char *page) -{ - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); -} - -static ssize_t r2nm_cluster_attr_reconnect_delay_ms_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - return r2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); -} - -static ssize_t r2nm_cluster_attr_fence_method_read( - struct r2nm_cluster *cluster, char *page) -{ - ssize_t ret = 0; - - if (cluster) - ret = sprintf(page, "%s\n", - r2nm_fence_method_desc[cluster->cl_fence_method]); - return ret; -} - -static ssize_t r2nm_cluster_attr_fence_method_write( - struct r2nm_cluster *cluster, const char *page, size_t count) -{ - unsigned int i; - - if (page[count - 1] != '\n') - goto bail; - - for (i = 0; i < R2NM_FENCE_METHODS; ++i) { - if (count != strlen(r2nm_fence_method_desc[i]) + 1) - continue; - if (strncasecmp(page, r2nm_fence_method_desc[i], count - 1)) - continue; - if (cluster->cl_fence_method != i) { - printk(KERN_INFO "ramster: Changing fence method to %s\n", - r2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; - } - return count; - } - -bail: - return -EINVAL; -} - -static struct r2nm_cluster_attribute r2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_idle_timeout_ms_read, - .store = r2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_keepalive_delay_ms_read, - .store = r2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_reconnect_delay_ms_read, - .store = r2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct r2nm_cluster_attribute r2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = r2nm_cluster_attr_fence_method_read, - .store = r2nm_cluster_attr_fence_method_write, -}; - -static struct configfs_attribute *r2nm_cluster_attrs[] = { - &r2nm_cluster_attr_idle_timeout_ms.attr, - &r2nm_cluster_attr_keepalive_delay_ms.attr, - &r2nm_cluster_attr_reconnect_delay_ms.attr, - &r2nm_cluster_attr_fence_method.attr, - NULL, -}; -static ssize_t r2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - struct r2nm_cluster_attribute *r2nm_cluster_attr = - container_of(attr, struct r2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (r2nm_cluster_attr->show) - ret = r2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t r2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - struct r2nm_cluster_attribute 
*r2nm_cluster_attr = - container_of(attr, struct r2nm_cluster_attribute, attr); - ssize_t ret; - - if (r2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = r2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} - -static struct config_item *r2nm_node_group_make_item(struct config_group *group, - const char *name) -{ - struct r2nm_node *node = NULL; - - if (strlen(name) > R2NM_MAX_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - - node = kzalloc(sizeof(struct r2nm_node), GFP_KERNEL); - if (node == NULL) - return ERR_PTR(-ENOMEM); - - strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ - config_item_init_type_name(&node->nd_item, name, &r2nm_node_type); - spin_lock_init(&node->nd_lock); - - mlog(ML_CLUSTER, "r2nm: Registering node %s\n", name); - - return &node->nd_item; -} - -static void r2nm_node_group_drop_item(struct config_group *group, - struct config_item *item) -{ - struct r2nm_node *node = to_r2nm_node(item); - struct r2nm_cluster *cluster = - to_r2nm_cluster(group->cg_item.ci_parent); - - r2net_disconnect_node(node); - - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = R2NM_INVALID_NODE_NUM; - r2net_stop_listening(node); - } - - /* XXX call into net to stop this node from trading messages */ - - write_lock(&cluster->cl_nodes_lock); - - /* XXX sloppy */ - if (node->nd_ipv4_address) - rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); - - /* nd_num might be 0 if the node number hasn't been set.. */ - if (cluster->cl_nodes[node->nd_num] == node) { - cluster->cl_nodes[node->nd_num] = NULL; - clear_bit(node->nd_num, cluster->cl_nodes_bitmap); - } - write_unlock(&cluster->cl_nodes_lock); - - mlog(ML_CLUSTER, "r2nm: Unregistered node %s\n", - config_item_name(&node->nd_item)); - - config_item_put(item); -} - -static struct configfs_group_operations r2nm_node_group_group_ops = { - .make_item = r2nm_node_group_make_item, - .drop_item = r2nm_node_group_drop_item, -}; - -static struct config_item_type r2nm_node_group_type = { - .ct_group_ops = &r2nm_node_group_group_ops, - .ct_owner = THIS_MODULE, -}; - -/* cluster */ - -static void r2nm_cluster_release(struct config_item *item) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - - kfree(cluster->cl_group.default_groups); - kfree(cluster); -} - -static struct configfs_item_operations r2nm_cluster_item_ops = { - .release = r2nm_cluster_release, - .show_attribute = r2nm_cluster_show, - .store_attribute = r2nm_cluster_store, -}; - -static struct config_item_type r2nm_cluster_type = { - .ct_item_ops = &r2nm_cluster_item_ops, - .ct_attrs = r2nm_cluster_attrs, - .ct_owner = THIS_MODULE, -}; - -/* cluster set */ - -struct r2nm_cluster_group { - struct configfs_subsystem cs_subsys; - /* some stuff? */ -}; - -#if 0 -static struct r2nm_cluster_group * -to_r2nm_cluster_group(struct config_group *group) -{ - return group ? 
- container_of(to_configfs_subsystem(group), - struct r2nm_cluster_group, cs_subsys) - : NULL; -} -#endif - -static struct config_group * -r2nm_cluster_group_make_group(struct config_group *group, - const char *name) -{ - struct r2nm_cluster *cluster = NULL; - struct r2nm_node_group *ns = NULL; - struct config_group *r2hb_group = NULL, *ret = NULL; - void *defs = NULL; - - /* this runs under the parent dir's i_mutex; there can be only - * one caller in here at a time */ - if (r2nm_single_cluster) - return ERR_PTR(-ENOSPC); - - cluster = kzalloc(sizeof(struct r2nm_cluster), GFP_KERNEL); - ns = kzalloc(sizeof(struct r2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); - r2hb_group = r2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || r2hb_group == NULL || defs == NULL) - goto out; - - config_group_init_type_name(&cluster->cl_group, name, - &r2nm_cluster_type); - config_group_init_type_name(&ns->ns_group, "node", - &r2nm_node_group_type); - - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = r2hb_group; - cluster->cl_group.default_groups[2] = NULL; - rwlock_init(&cluster->cl_nodes_lock); - cluster->cl_node_ip_tree = RB_ROOT; - cluster->cl_reconnect_delay_ms = R2NET_RECONNECT_DELAY_MS_DEFAULT; - cluster->cl_idle_timeout_ms = R2NET_IDLE_TIMEOUT_MS_DEFAULT; - cluster->cl_keepalive_delay_ms = R2NET_KEEPALIVE_DELAY_MS_DEFAULT; - cluster->cl_fence_method = R2NM_FENCE_RESET; - - ret = &cluster->cl_group; - r2nm_single_cluster = cluster; - -out: - if (ret == NULL) { - kfree(cluster); - kfree(ns); - r2hb_free_hb_set(r2hb_group); - kfree(defs); - ret = ERR_PTR(-ENOMEM); - } - - return ret; -} - -static void r2nm_cluster_group_drop_item(struct config_group *group, - struct config_item *item) -{ - struct r2nm_cluster *cluster = to_r2nm_cluster(item); - int i; - struct config_item *killme; - - BUG_ON(r2nm_single_cluster != cluster); - r2nm_single_cluster = NULL; - - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - - config_item_put(item); -} - -static struct configfs_group_operations r2nm_cluster_group_group_ops = { - .make_group = r2nm_cluster_group_make_group, - .drop_item = r2nm_cluster_group_drop_item, -}; - -static struct config_item_type r2nm_cluster_group_type = { - .ct_group_ops = &r2nm_cluster_group_group_ops, - .ct_owner = THIS_MODULE, -}; - -static struct r2nm_cluster_group r2nm_cluster_group = { - .cs_subsys = { - .su_group = { - .cg_item = { - .ci_namebuf = "cluster", - .ci_type = &r2nm_cluster_group_type, - }, - }, - }, -}; - -int r2nm_depend_item(struct config_item *item) -{ - return configfs_depend_item(&r2nm_cluster_group.cs_subsys, item); -} - -void r2nm_undepend_item(struct config_item *item) -{ - configfs_undepend_item(&r2nm_cluster_group.cs_subsys, item); -} - -int r2nm_depend_this_node(void) -{ - int ret = 0; - struct r2nm_node *local_node; - - local_node = r2nm_get_node_by_num(r2nm_this_node()); - if (!local_node) { - ret = -EINVAL; - goto out; - } - - ret = r2nm_depend_item(&local_node->nd_item); - r2nm_node_put(local_node); - -out: - return ret; -} - -void r2nm_undepend_this_node(void) -{ - struct r2nm_node *local_node; - - local_node = r2nm_get_node_by_num(r2nm_this_node()); - BUG_ON(!local_node); - - r2nm_undepend_item(&local_node->nd_item); - r2nm_node_put(local_node); -} - - -static void __exit 
exit_r2nm(void) -{ - /* XXX sync with hb callbacks and shut down hb? */ - r2net_unregister_hb_callbacks(); - configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys); - - r2net_exit(); - r2hb_exit(); -} - -static int __init init_r2nm(void) -{ - int ret = -1; - - ret = r2hb_init(); - if (ret) - goto out; - - ret = r2net_init(); - if (ret) - goto out_r2hb; - - ret = r2net_register_hb_callbacks(); - if (ret) - goto out_r2net; - - config_group_init(&r2nm_cluster_group.cs_subsys.su_group); - mutex_init(&r2nm_cluster_group.cs_subsys.su_mutex); - ret = configfs_register_subsystem(&r2nm_cluster_group.cs_subsys); - if (ret) { - printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); - goto out_callbacks; - } - - if (!ret) - goto out; - - configfs_unregister_subsystem(&r2nm_cluster_group.cs_subsys); -out_callbacks: - r2net_unregister_hb_callbacks(); -out_r2net: - r2net_exit(); -out_r2hb: - r2hb_exit(); -out: - return ret; -} - -MODULE_AUTHOR("Oracle"); -MODULE_LICENSE("GPL"); - -module_init(init_r2nm) -module_exit(exit_r2nm) diff --git a/drivers/staging/ramster/cluster/nodemanager.h b/drivers/staging/ramster/cluster/nodemanager.h deleted file mode 100755 index 41a04df58..000000000 --- a/drivers/staging/ramster/cluster/nodemanager.h +++ /dev/null @@ -1,88 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * nodemanager.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_NODEMANAGER_H -#define R2CLUSTER_NODEMANAGER_H - -#include "ramster_nodemanager.h" - -/* This totally doesn't belong here. */ -#include -#include - -enum r2nm_fence_method { - R2NM_FENCE_RESET = 0, - R2NM_FENCE_PANIC, - R2NM_FENCE_METHODS, /* Number of fence methods */ -}; - -struct r2nm_node { - spinlock_t nd_lock; - struct config_item nd_item; - char nd_name[R2NM_MAX_NAME_LEN+1]; /* replace? */ - __u8 nd_num; - /* only one address per node, as attributes, for now. */ - __be32 nd_ipv4_address; - __be16 nd_ipv4_port; - struct rb_node nd_ip_node; - /* there can be only one local node for now */ - int nd_local; - - unsigned long nd_set_attributes; -}; - -struct r2nm_cluster { - struct config_group cl_group; - unsigned cl_has_local:1; - u8 cl_local_node; - rwlock_t cl_nodes_lock; - struct r2nm_node *cl_nodes[R2NM_MAX_NODES]; - struct rb_root cl_node_ip_tree; - unsigned int cl_idle_timeout_ms; - unsigned int cl_keepalive_delay_ms; - unsigned int cl_reconnect_delay_ms; - enum r2nm_fence_method cl_fence_method; - - /* part of a hack for disk bitmap.. will go eventually. 
- zab */ - unsigned long cl_nodes_bitmap[BITS_TO_LONGS(R2NM_MAX_NODES)]; -}; - -extern struct r2nm_cluster *r2nm_single_cluster; - -u8 r2nm_this_node(void); - -int r2nm_configured_node_map(unsigned long *map, unsigned bytes); -struct r2nm_node *r2nm_get_node_by_num(u8 node_num); -struct r2nm_node *r2nm_get_node_by_ip(__be32 addr); -void r2nm_node_get(struct r2nm_node *node); -void r2nm_node_put(struct r2nm_node *node); - -int r2nm_depend_item(struct config_item *item); -void r2nm_undepend_item(struct config_item *item); -int r2nm_depend_this_node(void); -void r2nm_undepend_this_node(void); - -#endif /* R2CLUSTER_NODEMANAGER_H */ diff --git a/drivers/staging/ramster/cluster/ramster_nodemanager.h b/drivers/staging/ramster/cluster/ramster_nodemanager.h deleted file mode 100755 index 49f879d94..000000000 --- a/drivers/staging/ramster/cluster/ramster_nodemanager.h +++ /dev/null @@ -1,39 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ramster_nodemanager.h - * - * Header describing the interface between userspace and the kernel - * for the ramster_nodemanager module. - * - * Copyright (C) 2002, 2004, 2012 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef _RAMSTER_NODEMANAGER_H -#define _RAMSTER_NODEMANAGER_H - -#define R2NM_API_VERSION 5 - -#define R2NM_MAX_NODES 255 -#define R2NM_INVALID_NODE_NUM 255 - -/* host name, group name, cluster name all 64 bytes */ -#define R2NM_MAX_NAME_LEN 64 /* __NEW_UTS_LEN */ - -#endif /* _RAMSTER_NODEMANAGER_H */ diff --git a/drivers/staging/ramster/cluster/tcp.c b/drivers/staging/ramster/cluster/tcp.c deleted file mode 100755 index 3af1b2c51..000000000 --- a/drivers/staging/ramster/cluster/tcp.c +++ /dev/null @@ -1,2256 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - * ---- - * - * Callers for this were originally written against a very simple synchronus - * API. This implementation reflects those simple callers. 
Some day I'm sure - * we'll need to move to a more robust posting/callback mechanism. - * - * Transmit calls pass in kernel virtual addresses and block copying this into - * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an - * 'int' which gets filled with an errno off the wire in response to the - * message they send. - * - * Handlers for unsolicited messages are registered. Each socket has a page - * that incoming data is copied into. First the header, then the data. - * Handlers are called from only one thread with a reference to this per-socket - * page. This page is destroyed after the handler call, so it can't be - * referenced beyond the call. Handlers may block but are discouraged from - * doing so. - * - * Any framing errors (bad magic, large payload lengths) close a connection. - * - * Our sock_container holds the state we associate with a socket. It's current - * framing state is held there as well as the refcounting we do around when it - * is safe to tear down the socket. The socket is only finally torn down from - * the container when the container loses all of its references -- so as long - * as you hold a ref on the container you can trust that the socket is valid - * for use with kernel socket APIs. - * - * Connections are initiated between a pair of nodes when the node with the - * higher node number gets a heartbeat callback which indicates that the lower - * numbered node has started heartbeating. The lower numbered node is passive - * and only accepts the connection if the higher numbered node is heartbeating. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "heartbeat.h" -#include "tcp.h" -#include "nodemanager.h" -#define MLOG_MASK_PREFIX ML_TCP -#include "masklog.h" - -#include "tcp_internal.h" - -#define SC_NODEF_FMT "node %s (num %u) at %pI4:%u" - -/* - * In the following two log macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define msglog(hdr, fmt, args...) do { \ - typeof(hdr) __hdr = (hdr); \ - mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ - "key %08x num %u] " fmt, \ - be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ - be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ - be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ - be32_to_cpu(__hdr->msg_num) , ##args); \ -} while (0) - -#define sclog(sc, fmt, args...) do { \ - typeof(sc) __sc = (sc); \ - mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ - "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ - __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ - ##args); \ -} while (0) - -static DEFINE_RWLOCK(r2net_handler_lock); -static struct rb_root r2net_handler_tree = RB_ROOT; - -static struct r2net_node r2net_nodes[R2NM_MAX_NODES]; - -/* XXX someday we'll need better accounting */ -static struct socket *r2net_listen_sock; - -/* - * listen work is only queued by the listening socket callbacks on the - * r2net_wq. teardown detaches the callbacks before destroying the workqueue. - * quorum work is queued as sock containers are shutdown.. stop_listening - * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and - * destroying the work queue. 
- */ -static struct workqueue_struct *r2net_wq; -static struct work_struct r2net_listen_work; - -static struct r2hb_callback_func r2net_hb_up, r2net_hb_down; -#define R2NET_HB_PRI 0x1 - -static struct r2net_handshake *r2net_hand; -static struct r2net_msg *r2net_keep_req, *r2net_keep_resp; - -static int r2net_sys_err_translations[R2NET_ERR_MAX] = { - [R2NET_ERR_NONE] = 0, - [R2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, - [R2NET_ERR_OVERFLOW] = -EOVERFLOW, - [R2NET_ERR_DIED] = -EHOSTDOWN,}; - -/* can't quite avoid *all* internal declarations :/ */ -static void r2net_sc_connect_completed(struct work_struct *work); -static void r2net_rx_until_empty(struct work_struct *work); -static void r2net_shutdown_sc(struct work_struct *work); -static void r2net_listen_data_ready(struct sock *sk, int bytes); -static void r2net_sc_send_keep_req(struct work_struct *work); -static void r2net_idle_timer(unsigned long data); -static void r2net_sc_postpone_idle(struct r2net_sock_container *sc); -static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc); - -#ifdef CONFIG_DEBUG_FS -static void r2net_init_nst(struct r2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ - INIT_LIST_HEAD(&nst->st_net_debug_item); - nst->st_task = task; - nst->st_msg_type = msgtype; - nst->st_msg_key = msgkey; - nst->st_node = node; -} - -static inline void r2net_set_nst_sock_time(struct r2net_send_tracking *nst) -{ - nst->st_sock_time = ktime_get(); -} - -static inline void r2net_set_nst_send_time(struct r2net_send_tracking *nst) -{ - nst->st_send_time = ktime_get(); -} - -static inline void r2net_set_nst_status_time(struct r2net_send_tracking *nst) -{ - nst->st_status_time = ktime_get(); -} - -static inline void r2net_set_nst_sock_container(struct r2net_send_tracking *nst, - struct r2net_sock_container *sc) -{ - nst->st_sc = sc; -} - -static inline void r2net_set_nst_msg_id(struct r2net_send_tracking *nst, - u32 msg_id) -{ - nst->st_id = msg_id; -} - -static inline void r2net_set_sock_timer(struct r2net_sock_container *sc) -{ - sc->sc_tv_timer = ktime_get(); -} - -static inline void r2net_set_data_ready_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_data_ready = ktime_get(); -} - -static inline void r2net_set_advance_start_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_advance_start = ktime_get(); -} - -static inline void r2net_set_advance_stop_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_advance_stop = ktime_get(); -} - -static inline void r2net_set_func_start_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_func_start = ktime_get(); -} - -static inline void r2net_set_func_stop_time(struct r2net_sock_container *sc) -{ - sc->sc_tv_func_stop = ktime_get(); -} - -#else /* CONFIG_DEBUG_FS */ -# define r2net_init_nst(a, b, c, d, e) -# define r2net_set_nst_sock_time(a) -# define r2net_set_nst_send_time(a) -# define r2net_set_nst_status_time(a) -# define r2net_set_nst_sock_container(a, b) -# define r2net_set_nst_msg_id(a, b) -# define r2net_set_sock_timer(a) -# define r2net_set_data_ready_time(a) -# define r2net_set_advance_start_time(a) -# define r2net_set_advance_stop_time(a) -# define r2net_set_func_start_time(a) -# define r2net_set_func_stop_time(a) -#endif /* CONFIG_DEBUG_FS */ - -#ifdef CONFIG_RAMSTER_FS_STATS -static ktime_t r2net_get_func_run_time(struct r2net_sock_container *sc) -{ - return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start); -} - -static void r2net_update_send_stats(struct r2net_send_tracking *nst, - struct r2net_sock_container *sc) -{ - 
sc->sc_tv_status_total = ktime_add(sc->sc_tv_status_total, - ktime_sub(ktime_get(), - nst->st_status_time)); - sc->sc_tv_send_total = ktime_add(sc->sc_tv_send_total, - ktime_sub(nst->st_status_time, - nst->st_send_time)); - sc->sc_tv_acquiry_total = ktime_add(sc->sc_tv_acquiry_total, - ktime_sub(nst->st_send_time, - nst->st_sock_time)); - sc->sc_send_count++; -} - -static void r2net_update_recv_stats(struct r2net_sock_container *sc) -{ - sc->sc_tv_process_total = ktime_add(sc->sc_tv_process_total, - r2net_get_func_run_time(sc)); - sc->sc_recv_count++; -} - -#else - -# define r2net_update_send_stats(a, b) - -# define r2net_update_recv_stats(sc) - -#endif /* CONFIG_RAMSTER_FS_STATS */ - -static inline int r2net_reconnect_delay(void) -{ - return r2nm_single_cluster->cl_reconnect_delay_ms; -} - -static inline int r2net_keepalive_delay(void) -{ - return r2nm_single_cluster->cl_keepalive_delay_ms; -} - -static inline int r2net_idle_timeout(void) -{ - return r2nm_single_cluster->cl_idle_timeout_ms; -} - -static inline int r2net_sys_err_to_errno(enum r2net_system_error err) -{ - int trans; - BUG_ON(err >= R2NET_ERR_MAX); - trans = r2net_sys_err_translations[err]; - - /* Just in case we mess up the translation table above */ - BUG_ON(err != R2NET_ERR_NONE && trans == 0); - return trans; -} - -struct r2net_node *r2net_nn_from_num(u8 node_num) -{ - BUG_ON(node_num >= ARRAY_SIZE(r2net_nodes)); - return &r2net_nodes[node_num]; -} - -static u8 r2net_num_from_nn(struct r2net_node *nn) -{ - BUG_ON(nn == NULL); - return nn - r2net_nodes; -} - -/* ------------------------------------------------------------ */ - -static int r2net_prep_nsw(struct r2net_node *nn, struct r2net_status_wait *nsw) -{ - int ret = 0; - - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); - - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = R2NET_ERR_NONE; - nsw->ns_status = 0; - } - - return ret; -} - -static void r2net_complete_nsw_locked(struct r2net_node *nn, - struct r2net_status_wait *nsw, - enum r2net_system_error sys_status, - s32 status) -{ - assert_spin_locked(&nn->nn_lock); - - if (!list_empty(&nsw->ns_node_item)) { - list_del_init(&nsw->ns_node_item); - nsw->ns_sys_status = sys_status; - nsw->ns_status = status; - idr_remove(&nn->nn_status_idr, nsw->ns_id); - wake_up(&nsw->ns_wq); - } -} - -static void r2net_complete_nsw(struct r2net_node *nn, - struct r2net_status_wait *nsw, - u64 id, enum r2net_system_error sys_status, - s32 status) -{ - spin_lock(&nn->nn_lock); - if (nsw == NULL) { - if (id > INT_MAX) - goto out; - - nsw = idr_find(&nn->nn_status_idr, id); - if (nsw == NULL) - goto out; - } - - r2net_complete_nsw_locked(nn, nsw, sys_status, status); - -out: - spin_unlock(&nn->nn_lock); - return; -} - -static void r2net_complete_nodes_nsw(struct r2net_node *nn) -{ - struct r2net_status_wait *nsw, *tmp; - unsigned int num_kills = 0; - - assert_spin_locked(&nn->nn_lock); - - list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { - r2net_complete_nsw_locked(nn, nsw, R2NET_ERR_DIED, 0); - num_kills++; - } - - mlog(0, "completed %d messages for node %u\n", num_kills, - r2net_num_from_nn(nn)); -} - -static int r2net_nsw_completed(struct r2net_node *nn, - struct r2net_status_wait *nsw) -{ - int completed; - spin_lock(&nn->nn_lock); 
- completed = list_empty(&nsw->ns_node_item); - spin_unlock(&nn->nn_lock); - return completed; -} - -/* ------------------------------------------------------------ */ - -static void sc_kref_release(struct kref *kref) -{ - struct r2net_sock_container *sc = container_of(kref, - struct r2net_sock_container, sc_kref); - BUG_ON(timer_pending(&sc->sc_idle_timeout)); - - sclog(sc, "releasing\n"); - - if (sc->sc_sock) { - sock_release(sc->sc_sock); - sc->sc_sock = NULL; - } - - r2nm_undepend_item(&sc->sc_node->nd_item); - r2nm_node_put(sc->sc_node); - sc->sc_node = NULL; - - r2net_debug_del_sc(sc); - kfree(sc); -} - -static void sc_put(struct r2net_sock_container *sc) -{ - sclog(sc, "put\n"); - kref_put(&sc->sc_kref, sc_kref_release); -} -static void sc_get(struct r2net_sock_container *sc) -{ - sclog(sc, "get\n"); - kref_get(&sc->sc_kref); -} -static struct r2net_sock_container *sc_alloc(struct r2nm_node *node) -{ - struct r2net_sock_container *sc, *ret = NULL; - struct page *page = NULL; - int status = 0; - - page = alloc_page(GFP_NOFS); - sc = kzalloc(sizeof(*sc), GFP_NOFS); - if (sc == NULL || page == NULL) - goto out; - - kref_init(&sc->sc_kref); - r2nm_node_get(node); - sc->sc_node = node; - - /* pin the node item of the remote node */ - status = r2nm_depend_item(&node->nd_item); - if (status) { - mlog_errno(status); - r2nm_node_put(node); - goto out; - } - INIT_WORK(&sc->sc_connect_work, r2net_sc_connect_completed); - INIT_WORK(&sc->sc_rx_work, r2net_rx_until_empty); - INIT_WORK(&sc->sc_shutdown_work, r2net_shutdown_sc); - INIT_DELAYED_WORK(&sc->sc_keepalive_work, r2net_sc_send_keep_req); - - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = r2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; - - sclog(sc, "alloced\n"); - - ret = sc; - sc->sc_page = page; - r2net_debug_add_sc(sc); - sc = NULL; - page = NULL; - -out: - if (page) - __free_page(page); - kfree(sc); - - return ret; -} - -/* ------------------------------------------------------------ */ - -static void r2net_sc_queue_work(struct r2net_sock_container *sc, - struct work_struct *work) -{ - sc_get(sc); - if (!queue_work(r2net_wq, work)) - sc_put(sc); -} -static void r2net_sc_queue_delayed_work(struct r2net_sock_container *sc, - struct delayed_work *work, - int delay) -{ - sc_get(sc); - if (!queue_delayed_work(r2net_wq, work, delay)) - sc_put(sc); -} -static void r2net_sc_cancel_delayed_work(struct r2net_sock_container *sc, - struct delayed_work *work) -{ - if (cancel_delayed_work(work)) - sc_put(sc); -} - -static atomic_t r2net_connected_peers = ATOMIC_INIT(0); - -int r2net_num_connected_peers(void) -{ - return atomic_read(&r2net_connected_peers); -} - -static void r2net_set_nn_state(struct r2net_node *nn, - struct r2net_sock_container *sc, - unsigned valid, int err) -{ - int was_valid = nn->nn_sc_valid; - int was_err = nn->nn_persistent_error; - struct r2net_sock_container *old_sc = nn->nn_sc; - - assert_spin_locked(&nn->nn_lock); - - if (old_sc && !sc) - atomic_dec(&r2net_connected_peers); - else if (!old_sc && sc) - atomic_inc(&r2net_connected_peers); - - /* the node num comparison and single connect/accept path should stop - * an non-null sc from being overwritten with another */ - BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); - mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); - mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); - - if (was_valid && !valid && err == 0) - err = -ENOTCONN; - - mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", - 
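The sc_get()/sc_put() helpers and the queue_work wrappers above follow a common kref discipline: every queued work instance pins a reference on the container, and the reference taken optimistically is dropped again when queue_work() says the item was already pending, because that pending instance already owns one. A stripped-down sketch of the same pattern with hypothetical names (my_conn, my_wq):

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_conn {
	struct kref		ref;
	struct work_struct	rx_work;
};

static void my_conn_release(struct kref *kref)
{
	struct my_conn *c = container_of(kref, struct my_conn, ref);

	kfree(c);
}

static void my_conn_put(struct my_conn *c)
{
	kref_put(&c->ref, my_conn_release);
}

/* queue rx work with a reference held for the lifetime of the work item */
static void my_conn_queue_rx(struct my_conn *c, struct workqueue_struct *my_wq)
{
	kref_get(&c->ref);
	if (!queue_work(my_wq, &c->rx_work))
		my_conn_put(c);	/* already queued; that instance holds the ref */
}

The work function is then responsible for dropping its own reference when it finishes, just as the rx and shutdown work functions in this file end with sc_put(sc).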
r2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, - nn->nn_persistent_error, err); - - nn->nn_sc = sc; - nn->nn_sc_valid = valid ? 1 : 0; - nn->nn_persistent_error = err; - - /* mirrors r2net_tx_can_proceed() */ - if (nn->nn_persistent_error || nn->nn_sc_valid) - wake_up(&nn->nn_sc_wq); - - if (!was_err && nn->nn_persistent_error) { - queue_delayed_work(r2net_wq, &nn->nn_still_up, - msecs_to_jiffies(R2NET_QUORUM_DELAY_MS)); - } - - if (was_valid && !valid) { - printk(KERN_NOTICE "ramster: No longer connected to " - SC_NODEF_FMT "\n", - old_sc->sc_node->nd_name, old_sc->sc_node->nd_num, - &old_sc->sc_node->nd_ipv4_address, - ntohs(old_sc->sc_node->nd_ipv4_port)); - r2net_complete_nodes_nsw(nn); - } - - if (!was_valid && valid) { - cancel_delayed_work(&nn->nn_connect_expired); - printk(KERN_NOTICE "ramster: %s " SC_NODEF_FMT "\n", - r2nm_this_node() > sc->sc_node->nd_num ? - "Connected to" : "Accepted connection from", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port)); - } - - /* trigger the connecting worker func as long as we're not valid, - * it will back off if it shouldn't connect. This can be called - * from node config teardown and so needs to be careful about - * the work queue actually being up. */ - if (!valid && r2net_wq) { - unsigned long delay; - /* delay if we're within a RECONNECT_DELAY of the - * last attempt */ - delay = (nn->nn_last_connect_attempt + - msecs_to_jiffies(r2net_reconnect_delay())) - - jiffies; - if (delay > msecs_to_jiffies(r2net_reconnect_delay())) - delay = 0; - mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); - queue_delayed_work(r2net_wq, &nn->nn_connect_work, delay); - - /* - * Delay the expired work after idle timeout. - * - * We might have lots of failed connection attempts that run - * through here but we only cancel the connect_expired work when - * a connection attempt succeeds. So only the first enqueue of - * the connect_expired work will do anything. The rest will see - * that it's already queued and do nothing. 
- */ - delay += msecs_to_jiffies(r2net_idle_timeout()); - queue_delayed_work(r2net_wq, &nn->nn_connect_expired, delay); - } - - /* keep track of the nn's sc ref for the caller */ - if ((old_sc == NULL) && sc) - sc_get(sc); - if (old_sc && (old_sc != sc)) { - r2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); - sc_put(old_sc); - } -} - -/* see r2net_register_callbacks() */ -static void r2net_data_ready(struct sock *sk, int bytes) -{ - void (*ready)(struct sock *sk, int bytes); - - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct r2net_sock_container *sc = sk->sk_user_data; - sclog(sc, "data_ready hit\n"); - r2net_set_data_ready_time(sc); - r2net_sc_queue_work(sc, &sc->sc_rx_work); - ready = sc->sc_data_ready; - } else { - ready = sk->sk_data_ready; - } - read_unlock(&sk->sk_callback_lock); - - ready(sk, bytes); -} - -/* see r2net_register_callbacks() */ -static void r2net_state_change(struct sock *sk) -{ - void (*state_change)(struct sock *sk); - struct r2net_sock_container *sc; - - read_lock(&sk->sk_callback_lock); - sc = sk->sk_user_data; - if (sc == NULL) { - state_change = sk->sk_state_change; - goto out; - } - - sclog(sc, "state_change to %d\n", sk->sk_state); - - state_change = sc->sc_state_change; - - switch (sk->sk_state) { - - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - r2net_sc_queue_work(sc, &sc->sc_connect_work); - break; - default: - printk(KERN_INFO "ramster: Connection to " - SC_NODEF_FMT " shutdown, state %d\n", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), sk->sk_state); - r2net_sc_queue_work(sc, &sc->sc_shutdown_work); - break; - - } -out: - read_unlock(&sk->sk_callback_lock); - state_change(sk); -} - -/* - * we register callbacks so we can queue work on events before calling - * the original callbacks. our callbacks our careful to test user_data - * to discover when they've reaced with r2net_unregister_callbacks(). - */ -static void r2net_register_callbacks(struct sock *sk, - struct r2net_sock_container *sc) -{ - write_lock_bh(&sk->sk_callback_lock); - - /* accepted sockets inherit the old listen socket data ready */ - if (sk->sk_data_ready == r2net_listen_data_ready) { - sk->sk_data_ready = sk->sk_user_data; - sk->sk_user_data = NULL; - } - - BUG_ON(sk->sk_user_data != NULL); - sk->sk_user_data = sc; - sc_get(sc); - - sc->sc_data_ready = sk->sk_data_ready; - sc->sc_state_change = sk->sk_state_change; - sk->sk_data_ready = r2net_data_ready; - sk->sk_state_change = r2net_state_change; - - mutex_init(&sc->sc_send_lock); - - write_unlock_bh(&sk->sk_callback_lock); -} - -static int r2net_unregister_callbacks(struct sock *sk, - struct r2net_sock_container *sc) -{ - int ret = 0; - - write_lock_bh(&sk->sk_callback_lock); - if (sk->sk_user_data == sc) { - ret = 1; - sk->sk_user_data = NULL; - sk->sk_data_ready = sc->sc_data_ready; - sk->sk_state_change = sc->sc_state_change; - } - write_unlock_bh(&sk->sk_callback_lock); - - return ret; -} - -/* - * this is a little helper that is called by callers who have seen a problem - * with an sc and want to detach it from the nn if someone already hasn't beat - * them to it. if an error is given then the shutdown will be persistent - * and pending transmits will be canceled. 
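r2net_data_ready() and r2net_register_callbacks() above use the standard kernel-socket interception idiom: stash the original sk_data_ready, point sk_user_data at the per-connection object, do only non-blocking bookkeeping in the hook, and always chain to the saved callback. The same idiom in its smallest form, with hypothetical names (my_conn, my_data_ready, my_register), assuming the pre-3.15 sk_data_ready signature used throughout this file:

#include <net/sock.h>

struct my_conn {
	void (*saved_data_ready)(struct sock *sk, int bytes);
};

static void my_data_ready(struct sock *sk, int bytes)
{
	void (*ready)(struct sock *sk, int bytes);

	read_lock(&sk->sk_callback_lock);
	if (sk->sk_user_data) {
		struct my_conn *c = sk->sk_user_data;

		/* non-blocking bookkeeping only; heavy lifting belongs on a workqueue */
		ready = c->saved_data_ready;
	} else {
		/* raced with unregistration; use whatever is installed now */
		ready = sk->sk_data_ready;
	}
	read_unlock(&sk->sk_callback_lock);

	ready(sk, bytes);
}

static void my_register(struct sock *sk, struct my_conn *c)
{
	write_lock_bh(&sk->sk_callback_lock);
	c->saved_data_ready = sk->sk_data_ready;
	sk->sk_user_data = c;
	sk->sk_data_ready = my_data_ready;
	write_unlock_bh(&sk->sk_callback_lock);
}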
- */ -static void r2net_ensure_shutdown(struct r2net_node *nn, - struct r2net_sock_container *sc, - int err) -{ - spin_lock(&nn->nn_lock); - if (nn->nn_sc == sc) - r2net_set_nn_state(nn, NULL, 0, err); - spin_unlock(&nn->nn_lock); -} - -/* - * This work queue function performs the blocking parts of socket shutdown. A - * few paths lead here. set_nn_state will trigger this callback if it sees an - * sc detached from the nn. state_change will also trigger this callback - * directly when it sees errors. In that case we need to call set_nn_state - * ourselves as state_change couldn't get the nn_lock and call set_nn_state - * itself. - */ -static void r2net_shutdown_sc(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_shutdown_work); - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - - sclog(sc, "shutting down\n"); - - /* drop the callbacks ref and call shutdown only once */ - if (r2net_unregister_callbacks(sc->sc_sock->sk, sc)) { - /* we shouldn't flush as we're in the thread, the - * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); - r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); - sc_put(sc); - kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); - } - - /* not fatal so failed connects before the other guy has our - * heartbeat can be retried */ - r2net_ensure_shutdown(nn, sc, 0); - sc_put(sc); -} - -/* ------------------------------------------------------------ */ - -static int r2net_handler_cmp(struct r2net_msg_handler *nmh, u32 msg_type, - u32 key) -{ - int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); - - if (ret == 0) - ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); - - return ret; -} - -static struct r2net_msg_handler * -r2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, - struct rb_node **ret_parent) -{ - struct rb_node **p = &r2net_handler_tree.rb_node; - struct rb_node *parent = NULL; - struct r2net_msg_handler *nmh, *ret = NULL; - int cmp; - - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct r2net_msg_handler, nh_node); - cmp = r2net_handler_cmp(nmh, msg_type, key); - - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { - ret = nmh; - break; - } - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - - return ret; -} - -static void r2net_handler_kref_release(struct kref *kref) -{ - struct r2net_msg_handler *nmh; - nmh = container_of(kref, struct r2net_msg_handler, nh_kref); - - kfree(nmh); -} - -static void r2net_handler_put(struct r2net_msg_handler *nmh) -{ - kref_put(&nmh->nh_kref, r2net_handler_kref_release); -} - -/* max_len is protection for the handler func. incoming messages won't - * be given to the handler if their payload is longer than the max. 
*/ -int r2net_register_handler(u32 msg_type, u32 key, u32 max_len, - r2net_msg_handler_func *func, void *data, - r2net_post_msg_handler_func *post_func, - struct list_head *unreg_list) -{ - struct r2net_msg_handler *nmh = NULL; - struct rb_node **p, *parent; - int ret = 0; - - if (max_len > R2NET_MAX_PAYLOAD_BYTES) { - mlog(0, "max_len for message handler out of range: %u\n", - max_len); - ret = -EINVAL; - goto out; - } - - if (!msg_type) { - mlog(0, "no message type provided: %u, %p\n", msg_type, func); - ret = -EINVAL; - goto out; - - } - if (!func) { - mlog(0, "no message handler provided: %u, %p\n", - msg_type, func); - ret = -EINVAL; - goto out; - } - - nmh = kzalloc(sizeof(struct r2net_msg_handler), GFP_NOFS); - if (nmh == NULL) { - ret = -ENOMEM; - goto out; - } - - nmh->nh_func = func; - nmh->nh_func_data = data; - nmh->nh_post_func = post_func; - nmh->nh_msg_type = msg_type; - nmh->nh_max_len = max_len; - nmh->nh_key = key; - /* the tree and list get this ref.. they're both removed in - * unregister when this ref is dropped */ - kref_init(&nmh->nh_kref); - INIT_LIST_HEAD(&nmh->nh_unregister_item); - - write_lock(&r2net_handler_lock); - if (r2net_handler_tree_lookup(msg_type, key, &p, &parent)) - ret = -EEXIST; - else { - rb_link_node(&nmh->nh_node, parent, p); - rb_insert_color(&nmh->nh_node, &r2net_handler_tree); - list_add_tail(&nmh->nh_unregister_item, unreg_list); - - mlog(ML_TCP, "registered handler func %p type %u key %08x\n", - func, msg_type, key); - /* we've had some trouble with handlers seemingly vanishing. */ - mlog_bug_on_msg(r2net_handler_tree_lookup(msg_type, key, &p, - &parent) == NULL, - "couldn't find handler we *just* registered " - "for type %u key %08x\n", msg_type, key); - } - write_unlock(&r2net_handler_lock); - if (ret) - goto out; - -out: - if (ret) - kfree(nmh); - - return ret; -} -EXPORT_SYMBOL_GPL(r2net_register_handler); - -void r2net_unregister_handler_list(struct list_head *list) -{ - struct r2net_msg_handler *nmh, *n; - - write_lock(&r2net_handler_lock); - list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { - mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", - nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); - rb_erase(&nmh->nh_node, &r2net_handler_tree); - list_del_init(&nmh->nh_unregister_item); - kref_put(&nmh->nh_kref, r2net_handler_kref_release); - } - write_unlock(&r2net_handler_lock); -} -EXPORT_SYMBOL_GPL(r2net_unregister_handler_list); - -static struct r2net_msg_handler *r2net_handler_get(u32 msg_type, u32 key) -{ - struct r2net_msg_handler *nmh; - - read_lock(&r2net_handler_lock); - nmh = r2net_handler_tree_lookup(msg_type, key, NULL, NULL); - if (nmh) - kref_get(&nmh->nh_kref); - read_unlock(&r2net_handler_lock); - - return nmh; -} - -/* ------------------------------------------------------------ */ - -static int r2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) -{ - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; -} - -static int r2net_send_tcp_msg(struct socket *sock, struct kvec *vec, - size_t veclen, size_t total) -{ - int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; - - if (sock == NULL) { - ret = -EINVAL; - goto out; - } - - oldfs = get_fs(); - 
set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; -out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); - return ret; -} - -static void r2net_sendpage(struct r2net_sock_container *sc, - void *kmalloced_virt, - size_t size) -{ - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - ssize_t ret; - - while (1) { - mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, - size, MSG_DONTWAIT); - mutex_unlock(&sc->sc_send_lock); - if (ret == size) - break; - if (ret == (ssize_t)-EAGAIN) { - mlog(0, "sendpage of size %zu to " SC_NODEF_FMT - " returned EAGAIN\n", size, sc->sc_node->nd_name, - sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port)); - cond_resched(); - continue; - } - mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT - " failed with %zd\n", size, sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), ret); - r2net_ensure_shutdown(nn, sc, 0); - break; - } -} - -static void r2net_init_msg(struct r2net_msg *msg, u16 data_len, - u16 msg_type, u32 key) -{ - memset(msg, 0, sizeof(struct r2net_msg)); - msg->magic = cpu_to_be16(R2NET_MSG_MAGIC); - msg->data_len = cpu_to_be16(data_len); - msg->msg_type = cpu_to_be16(msg_type); - msg->sys_status = cpu_to_be32(R2NET_ERR_NONE); - msg->status = 0; - msg->key = cpu_to_be32(key); -} - -static int r2net_tx_can_proceed(struct r2net_node *nn, - struct r2net_sock_container **sc_ret, - int *error) -{ - int ret = 0; - - spin_lock(&nn->nn_lock); - if (nn->nn_persistent_error) { - ret = 1; - *sc_ret = NULL; - *error = nn->nn_persistent_error; - } else if (nn->nn_sc_valid) { - kref_get(&nn->nn_sc->sc_kref); - - ret = 1; - *sc_ret = nn->nn_sc; - *error = 0; - } - spin_unlock(&nn->nn_lock); - - return ret; -} - -/* Get a map of all nodes to which this node is currently connected to */ -void r2net_fill_node_map(unsigned long *map, unsigned bytes) -{ - struct r2net_sock_container *sc; - int node, ret; - - BUG_ON(bytes < (BITS_TO_LONGS(R2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); - for (node = 0; node < R2NM_MAX_NODES; ++node) { - r2net_tx_can_proceed(r2net_nn_from_num(node), &sc, &ret); - if (!ret) { - set_bit(node, map); - sc_put(sc); - } - } -} -EXPORT_SYMBOL_GPL(r2net_fill_node_map); - -int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, - size_t caller_veclen, u8 target_node, int *status) -{ - int ret = 0; - struct r2net_msg *msg = NULL; - size_t veclen, caller_bytes = 0; - struct kvec *vec = NULL; - struct r2net_sock_container *sc = NULL; - struct r2net_node *nn = r2net_nn_from_num(target_node); - struct r2net_status_wait nsw = { - .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), - }; - struct r2net_send_tracking nst; - - /* this may be a general bug fix */ - init_waitqueue_head(&nsw.ns_wq); - - r2net_init_nst(&nst, msg_type, key, current, target_node); - - if (r2net_wq == NULL) { - mlog(0, "attempt to tx without r2netd running\n"); - ret = -ESRCH; - goto out; - } - - if (caller_veclen == 0) { - mlog(0, "bad kvec array length\n"); - ret = -EINVAL; - goto out; - } - - caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); - if (caller_bytes > R2NET_MAX_PAYLOAD_BYTES) { - mlog(0, 
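r2net_send_tcp_msg() above does the classic get_fs()/set_fs(get_ds()) dance so that sock_sendmsg() accepts a kernel-space iovec. Kernels of this era also export kernel_sendmsg(), which hides the address-limit switch; a sketch of an equivalent send built on it (my_send and its arguments are hypothetical, the caller supplies a connected kernel struct socket):

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

/* send a single kernel buffer over a connected kernel socket */
static int my_send(struct socket *sock, void *buf, size_t len)
{
	struct kvec vec = {
		.iov_base	= buf,
		.iov_len	= len,
	};
	struct msghdr msg = {
		.msg_flags	= 0,
	};
	int ret;

	ret = kernel_sendmsg(sock, &msg, &vec, 1, len);
	if (ret >= 0 && (size_t)ret != len)
		ret = -EPIPE;	/* mirror the short-write handling above */
	return ret < 0 ? ret : 0;
}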
"total payload len %zu too large\n", caller_bytes); - ret = -EINVAL; - goto out; - } - - if (target_node == r2nm_this_node()) { - ret = -ELOOP; - goto out; - } - - r2net_debug_add_nst(&nst); - - r2net_set_nst_sock_time(&nst); - - wait_event(nn->nn_sc_wq, r2net_tx_can_proceed(nn, &sc, &ret)); - if (ret) - goto out; - - r2net_set_nst_sock_container(&nst, sc); - - veclen = caller_veclen + 1; - vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); - if (vec == NULL) { - mlog(0, "failed to %zu element kvec!\n", veclen); - ret = -ENOMEM; - goto out; - } - - msg = kmalloc(sizeof(struct r2net_msg), GFP_ATOMIC); - if (!msg) { - mlog(0, "failed to allocate a r2net_msg!\n"); - ret = -ENOMEM; - goto out; - } - - r2net_init_msg(msg, caller_bytes, msg_type, key); - - vec[0].iov_len = sizeof(struct r2net_msg); - vec[0].iov_base = msg; - memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); - - ret = r2net_prep_nsw(nn, &nsw); - if (ret) - goto out; - - msg->msg_num = cpu_to_be32(nsw.ns_id); - r2net_set_nst_msg_id(&nst, nsw.ns_id); - - r2net_set_nst_send_time(&nst); - - /* finally, convert the message header to network byte-order - * and send */ - mutex_lock(&sc->sc_send_lock); - ret = r2net_send_tcp_msg(sc->sc_sock, vec, veclen, - sizeof(struct r2net_msg) + caller_bytes); - mutex_unlock(&sc->sc_send_lock); - msglog(msg, "sending returned %d\n", ret); - if (ret < 0) { - mlog(0, "error returned from r2net_send_tcp_msg=%d\n", ret); - goto out; - } - - /* wait on other node's handler */ - r2net_set_nst_status_time(&nst); - wait_event(nsw.ns_wq, r2net_nsw_completed(nn, &nsw)); - - r2net_update_send_stats(&nst, sc); - - /* Note that we avoid overwriting the callers status return - * variable if a system error was reported on the other - * side. Callers beware. 
*/ - ret = r2net_sys_err_to_errno(nsw.ns_sys_status); - if (status && !ret) - *status = nsw.ns_status; - - mlog(0, "woken, returning system status %d, user status %d\n", - ret, nsw.ns_status); -out: - r2net_debug_del_nst(&nst); /* must be before dropping sc and node */ - if (sc) - sc_put(sc); - kfree(vec); - kfree(msg); - r2net_complete_nsw(nn, &nsw, 0, 0, 0); - return ret; -} -EXPORT_SYMBOL_GPL(r2net_send_message_vec); - -int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len, - u8 target_node, int *status) -{ - struct kvec vec = { - .iov_base = data, - .iov_len = len, - }; - return r2net_send_message_vec(msg_type, key, &vec, 1, - target_node, status); -} -EXPORT_SYMBOL_GPL(r2net_send_message); - -static int r2net_send_status_magic(struct socket *sock, struct r2net_msg *hdr, - enum r2net_system_error syserr, int err) -{ - struct kvec vec = { - .iov_base = hdr, - .iov_len = sizeof(struct r2net_msg), - }; - - BUG_ON(syserr >= R2NET_ERR_MAX); - - /* leave other fields intact from the incoming message, msg_num - * in particular */ - hdr->sys_status = cpu_to_be32(syserr); - hdr->status = cpu_to_be32(err); - /* twiddle the magic */ - hdr->magic = cpu_to_be16(R2NET_MSG_STATUS_MAGIC); - hdr->data_len = 0; - - msglog(hdr, "about to send status magic %d\n", err); - /* hdr has been in host byteorder this whole time */ - return r2net_send_tcp_msg(sock, &vec, 1, sizeof(struct r2net_msg)); -} - -/* - * "data magic" is a long version of "status magic" where the message - * payload actually contains data to be passed in reply to certain messages - */ -static int r2net_send_data_magic(struct r2net_sock_container *sc, - struct r2net_msg *hdr, - void *data, size_t data_len, - enum r2net_system_error syserr, int err) -{ - struct kvec vec[2]; - int ret; - - vec[0].iov_base = hdr; - vec[0].iov_len = sizeof(struct r2net_msg); - vec[1].iov_base = data; - vec[1].iov_len = data_len; - - BUG_ON(syserr >= R2NET_ERR_MAX); - - /* leave other fields intact from the incoming message, msg_num - * in particular */ - hdr->sys_status = cpu_to_be32(syserr); - hdr->status = cpu_to_be32(err); - hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC); /* twiddle magic */ - hdr->data_len = cpu_to_be16(data_len); - - msglog(hdr, "about to send data magic %d\n", err); - /* hdr has been in host byteorder this whole time */ - ret = r2net_send_tcp_msg(sc->sc_sock, vec, 2, - sizeof(struct r2net_msg) + data_len); - return ret; -} - -/* - * called by a message handler to convert an otherwise normal reply - * message into a "data magic" message - */ -void r2net_force_data_magic(struct r2net_msg *hdr, u16 msgtype, u32 msgkey) -{ - hdr->magic = cpu_to_be16(R2NET_MSG_DATA_MAGIC); - hdr->msg_type = cpu_to_be16(msgtype); - hdr->key = cpu_to_be32(msgkey); -} - -/* this returns -errno if the header was unknown or too large, etc. 
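For reference, a caller-side sketch of the send API defined above: put the payload in an agreed wire format, pick a (msg_type, key) pair for which the remote node has registered a handler, and let r2net_send_message() block until the remote handler's status comes back. MY_MSG_TYPE, MY_MSG_KEY and struct my_ping are hypothetical; byte order inside the payload is entirely the caller's business:

#include <linux/types.h>
#include "tcp.h"	/* r2net_send_message() */

/* hypothetical wire format agreed on by both nodes */
struct my_ping {
	__be32 seq;
};

#define MY_MSG_TYPE	0x0001
#define MY_MSG_KEY	0xabcd1234

static int my_send_ping(u8 target_node, u32 seq)
{
	struct my_ping ping = {
		.seq = cpu_to_be32(seq),
	};
	int status = 0;
	int ret;

	/* blocks until the remote handler has run or the link failed */
	ret = r2net_send_message(MY_MSG_TYPE, MY_MSG_KEY, &ping, sizeof(ping),
				 target_node, &status);
	if (ret < 0)
		return ret;	/* transport or system error */
	return status;		/* the remote handler's return value */
}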
- * after this is called the buffer us reused for the next message */ -static int r2net_process_message(struct r2net_sock_container *sc, - struct r2net_msg *hdr) -{ - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - int ret = 0, handler_status; - enum r2net_system_error syserr; - struct r2net_msg_handler *nmh = NULL; - void *ret_data = NULL; - int data_magic = 0; - - msglog(hdr, "processing message\n"); - - r2net_sc_postpone_idle(sc); - - switch (be16_to_cpu(hdr->magic)) { - - case R2NET_MSG_STATUS_MAGIC: - /* special type for returning message status */ - r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), - be32_to_cpu(hdr->status)); - goto out; - case R2NET_MSG_KEEP_REQ_MAGIC: - r2net_sendpage(sc, r2net_keep_resp, sizeof(*r2net_keep_resp)); - goto out; - case R2NET_MSG_KEEP_RESP_MAGIC: - goto out; - case R2NET_MSG_MAGIC: - break; - case R2NET_MSG_DATA_MAGIC: - /* - * unlike a normal status magic, a data magic DOES - * (MUST) have a handler, so the control flow is - * a little funky here as a result - */ - data_magic = 1; - break; - default: - msglog(hdr, "bad magic\n"); - ret = -EINVAL; - goto out; - break; - } - - /* find a handler for it */ - handler_status = 0; - nmh = r2net_handler_get(be16_to_cpu(hdr->msg_type), - be32_to_cpu(hdr->key)); - if (!nmh) { - mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", - be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); - syserr = R2NET_ERR_NO_HNDLR; - goto out_respond; - } - - syserr = R2NET_ERR_NONE; - - if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) - syserr = R2NET_ERR_OVERFLOW; - - if (syserr != R2NET_ERR_NONE) - goto out_respond; - - r2net_set_func_start_time(sc); - sc->sc_msg_key = be32_to_cpu(hdr->key); - sc->sc_msg_type = be16_to_cpu(hdr->msg_type); - handler_status = (nmh->nh_func)(hdr, sizeof(struct r2net_msg) + - be16_to_cpu(hdr->data_len), - nmh->nh_func_data, &ret_data); - if (data_magic) { - /* - * handler handled data sent in reply to request - * so complete the transaction - */ - r2net_complete_nsw(nn, NULL, be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), handler_status); - goto out; - } - /* - * handler changed magic to DATA_MAGIC to reply to request for data, - * implies ret_data points to data to return and handler_status - * is the number of bytes of data - */ - if (be16_to_cpu(hdr->magic) == R2NET_MSG_DATA_MAGIC) { - ret = r2net_send_data_magic(sc, hdr, - ret_data, handler_status, - syserr, 0); - hdr = NULL; - mlog(0, "sending data reply %d, syserr %d returned %d\n", - handler_status, syserr, ret); - r2net_set_func_stop_time(sc); - - r2net_update_recv_stats(sc); - goto out; - } - r2net_set_func_stop_time(sc); - - r2net_update_recv_stats(sc); - -out_respond: - /* this destroys the hdr, so don't use it after this */ - mutex_lock(&sc->sc_send_lock); - ret = r2net_send_status_magic(sc->sc_sock, hdr, syserr, - handler_status); - mutex_unlock(&sc->sc_send_lock); - hdr = NULL; - mlog(0, "sending handler status %d, syserr %d returned %d\n", - handler_status, syserr, ret); - - if (nmh) { - BUG_ON(ret_data != NULL && nmh->nh_post_func == NULL); - if (nmh->nh_post_func) - (nmh->nh_post_func)(handler_status, nmh->nh_func_data, - ret_data); - } - -out: - if (nmh) - r2net_handler_put(nmh); - return ret; -} - -static int r2net_check_handshake(struct r2net_sock_container *sc) -{ - struct r2net_handshake *hand = page_address(sc->sc_page); - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - - if (hand->protocol_version != 
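The data-magic flow above is easiest to see from the handler side: a handler that wants to return payload, not just a status integer, flips the header to data magic with r2net_force_data_magic(), points *ret_data at the reply buffer, and returns the payload length. A sketch reusing the hypothetical MY_MSG_TYPE/MY_MSG_KEY from the send example earlier; note that the reply is transmitted after the handler returns, so the buffer must not live on the handler's stack (a static buffer keeps the sketch simple but allows only one reply in flight):

static char my_reply[R2NET_MAX_PAYLOAD_BYTES];

static int my_query_handler(struct r2net_msg *msg, u32 len, void *data,
			    void **ret_data)
{
	int nbytes;

	nbytes = snprintf(my_reply, sizeof(my_reply), "node says hello");

	/* turn the status reply into a data reply; the return value becomes
	 * the number of payload bytes pushed back to the waiting sender */
	r2net_force_data_magic(msg, MY_MSG_TYPE, MY_MSG_KEY);
	*ret_data = my_reply;
	return nbytes + 1;	/* include the trailing NUL */
}

Such a handler would be hooked up once with r2net_register_handler(MY_MSG_TYPE, MY_MSG_KEY, max_len, my_query_handler, NULL, NULL, &my_unreg_list) and torn down later through r2net_unregister_handler_list().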
cpu_to_be64(R2NET_PROTOCOL_VERSION)) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " Advertised net " - "protocol version %llu but %llu is required. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - (unsigned long long)be64_to_cpu(hand->protocol_version), - R2NET_PROTOCOL_VERSION); - - /* don't bother reconnecting if its the wrong version. */ - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - /* - * Ensure timeouts are consistent with other nodes, otherwise - * we can end up with one node thinking that the other must be down, - * but isn't. This can ultimately cause corruption. - */ - if (be32_to_cpu(hand->r2net_idle_timeout_ms) != - r2net_idle_timeout()) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a network " - "idle timeout of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2net_idle_timeout_ms), - r2net_idle_timeout()); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - if (be32_to_cpu(hand->r2net_keepalive_delay_ms) != - r2net_keepalive_delay()) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a keepalive " - "delay of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2net_keepalive_delay_ms), - r2net_keepalive_delay()); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - if (be32_to_cpu(hand->r2hb_heartbeat_timeout_ms) != - R2HB_MAX_WRITE_TIMEOUT_MS) { - printk(KERN_NOTICE "ramster: " SC_NODEF_FMT " uses a heartbeat " - "timeout of %u ms, but we use %u ms locally. " - "Disconnecting.\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), - be32_to_cpu(hand->r2hb_heartbeat_timeout_ms), - R2HB_MAX_WRITE_TIMEOUT_MS); - r2net_ensure_shutdown(nn, sc, -ENOTCONN); - return -1; - } - - sc->sc_handshake_ok = 1; - - spin_lock(&nn->nn_lock); - /* set valid and queue the idle timers only if it hasn't been - * shut down already */ - if (nn->nn_sc == sc) { - r2net_sc_reset_idle_timer(sc); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, sc, 1, 0); - } - spin_unlock(&nn->nn_lock); - - /* shift everything up as though it wasn't there */ - sc->sc_page_off -= sizeof(struct r2net_handshake); - if (sc->sc_page_off) - memmove(hand, hand + 1, sc->sc_page_off); - - return 0; -} - -/* this demuxes the queued rx bytes into header or payload bits and calls - * handlers as each full message is read off the socket. it returns -error, - * == 0 eof, or > 0 for progress made.*/ -static int r2net_advance_rx(struct r2net_sock_container *sc) -{ - struct r2net_msg *hdr; - int ret = 0; - void *data; - size_t datalen; - - sclog(sc, "receiving\n"); - r2net_set_advance_start_time(sc); - - if (unlikely(sc->sc_handshake_ok == 0)) { - if (sc->sc_page_off < sizeof(struct r2net_handshake)) { - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = sizeof(struct r2net_handshake) - - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) - sc->sc_page_off += ret; - } - - if (sc->sc_page_off == sizeof(struct r2net_handshake)) { - r2net_check_handshake(sc); - if (unlikely(sc->sc_handshake_ok == 0)) - ret = -EPROTO; - } - goto out; - } - - /* do we need more header? 
*/ - if (sc->sc_page_off < sizeof(struct r2net_msg)) { - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = sizeof(struct r2net_msg) - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) { - sc->sc_page_off += ret; - /* only swab incoming here.. we can - * only get here once as we cross from - * being under to over */ - if (sc->sc_page_off == sizeof(struct r2net_msg)) { - hdr = page_address(sc->sc_page); - if (be16_to_cpu(hdr->data_len) > - R2NET_MAX_PAYLOAD_BYTES) - ret = -EOVERFLOW; - } - } - if (ret <= 0) - goto out; - } - - if (sc->sc_page_off < sizeof(struct r2net_msg)) { - /* oof, still don't have a header */ - goto out; - } - - /* this was swabbed above when we first read it */ - hdr = page_address(sc->sc_page); - - msglog(hdr, "at page_off %zu\n", sc->sc_page_off); - - /* do we need more payload? */ - if (sc->sc_page_off - sizeof(struct r2net_msg) < - be16_to_cpu(hdr->data_len)) { - /* need more payload */ - data = page_address(sc->sc_page) + sc->sc_page_off; - datalen = (sizeof(struct r2net_msg) + - be16_to_cpu(hdr->data_len)) - - sc->sc_page_off; - ret = r2net_recv_tcp_msg(sc->sc_sock, data, datalen); - if (ret > 0) - sc->sc_page_off += ret; - if (ret <= 0) - goto out; - } - - if (sc->sc_page_off - sizeof(struct r2net_msg) == - be16_to_cpu(hdr->data_len)) { - /* we can only get here once, the first time we read - * the payload.. so set ret to progress if the handler - * works out. after calling this the message is toast */ - ret = r2net_process_message(sc, hdr); - if (ret == 0) - ret = 1; - sc->sc_page_off = 0; - } - -out: - sclog(sc, "ret = %d\n", ret); - r2net_set_advance_stop_time(sc); - return ret; -} - -/* this work func is triggerd by data ready. it reads until it can read no - * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing - * our work the work struct will be marked and we'll be called again. */ -static void r2net_rx_until_empty(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, sc_rx_work); - int ret; - - do { - ret = r2net_advance_rx(sc); - } while (ret > 0); - - if (ret <= 0 && ret != -EAGAIN) { - struct r2net_node *nn = r2net_nn_from_num(sc->sc_node->nd_num); - sclog(sc, "saw error %d, closing\n", ret); - /* not permanent so read failed handshake can retry */ - r2net_ensure_shutdown(nn, sc, 0); - } - - sc_put(sc); -} - -static int r2net_set_nodelay(struct socket *sock) -{ - int ret, val = 1; - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; -} - -static void r2net_initialize_handshake(void) -{ - r2net_hand->r2hb_heartbeat_timeout_ms = cpu_to_be32( - R2HB_MAX_WRITE_TIMEOUT_MS); - r2net_hand->r2net_idle_timeout_ms = cpu_to_be32(r2net_idle_timeout()); - r2net_hand->r2net_keepalive_delay_ms = cpu_to_be32( - r2net_keepalive_delay()); - r2net_hand->r2net_reconnect_delay_ms = cpu_to_be32( - r2net_reconnect_delay()); -} - -/* ------------------------------------------------------------ */ - -/* called when a connect completes and after a sock is accepted. 
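The warning comment in r2net_set_nodelay() above explains why the code calls sock->ops->setsockopt() directly under set_fs(KERNEL_DS) instead of sock_setsockopt(). Kernels of this vintage also export kernel_setsockopt(), which is level-aware and avoids the address-limit games; a sketch, assuming a connected kernel struct socket:

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/tcp.h>

static int my_set_nodelay(struct socket *sock)
{
	int one = 1;

	/* level-aware, so TCP_NODELAY really ends up at SOL_TCP */
	return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
				 (char *)&one, sizeof(one));
}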
the - * rx path will see the response and mark the sc valid */ -static void r2net_sc_connect_completed(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_connect_work); - - mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", - (unsigned long long)R2NET_PROTOCOL_VERSION, - (unsigned long long)be64_to_cpu(r2net_hand->connector_id)); - - r2net_initialize_handshake(); - r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand)); - sc_put(sc); -} - -/* this is called as a work_struct func. */ -static void r2net_sc_send_keep_req(struct work_struct *work) -{ - struct r2net_sock_container *sc = - container_of(work, struct r2net_sock_container, - sc_keepalive_work.work); - - r2net_sendpage(sc, r2net_keep_req, sizeof(*r2net_keep_req)); - sc_put(sc); -} - -/* socket shutdown does a del_timer_sync against this as it tears down. - * we can't start this timer until we've got to the point in sc buildup - * where shutdown is going to be involved */ -static void r2net_idle_timer(unsigned long data) -{ - struct r2net_sock_container *sc = (struct r2net_sock_container *)data; -#ifdef CONFIG_DEBUG_FS - unsigned long msecs = ktime_to_ms(ktime_get()) - - ktime_to_ms(sc->sc_tv_timer); -#else - unsigned long msecs = r2net_idle_timeout(); -#endif - - printk(KERN_NOTICE "ramster: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", - sc->sc_node->nd_name, sc->sc_node->nd_num, - &sc->sc_node->nd_ipv4_address, ntohs(sc->sc_node->nd_ipv4_port), - msecs / 1000, msecs % 1000); - - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in r2net_start_connect. - */ - /* Avoid spurious shutdowns... not sure if this is still necessary */ - pr_err("ramster_idle_timer, skipping shutdown work\n"); -#if 0 - /* old code used to do these two lines */ - atomic_set(&nn->nn_timeout, 1); - r2net_sc_queue_work(sc, &sc->sc_shutdown_work); -#endif -} - -static void r2net_sc_reset_idle_timer(struct r2net_sock_container *sc) -{ - r2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); - r2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, - msecs_to_jiffies(r2net_keepalive_delay())); - r2net_set_sock_timer(sc); - mod_timer(&sc->sc_idle_timeout, - jiffies + msecs_to_jiffies(r2net_idle_timeout())); -} - -static void r2net_sc_postpone_idle(struct r2net_sock_container *sc) -{ - /* Only push out an existing timer */ - if (timer_pending(&sc->sc_idle_timeout)) - r2net_sc_reset_idle_timer(sc); -} - -/* this work func is kicked whenever a path sets the nn state which doesn't - * have valid set. This includes seeing hb come up, losing a connection, - * having a connect attempt fail, etc. 
This centralizes the logic which decides - * if a connect attempt should be made or if we should give up and all future - * transmit attempts should fail */ -static void r2net_start_connect(struct work_struct *work) -{ - struct r2net_node *nn = - container_of(work, struct r2net_node, nn_connect_work.work); - struct r2net_sock_container *sc = NULL; - struct r2nm_node *node = NULL, *mynode = NULL; - struct socket *sock = NULL; - struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; - int ret = 0, stop; - unsigned int timeout; - - /* if we're greater we initiate tx, otherwise we accept */ - if (r2nm_this_node() <= r2net_num_from_nn(nn)) - goto out; - - /* watch for racing with tearing a node down */ - node = r2nm_get_node_by_num(r2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; - goto out; - } - - mynode = r2nm_get_node_by_num(r2nm_this_node()); - if (mynode == NULL) { - ret = 0; - goto out; - } - - spin_lock(&nn->nn_lock); - /* - * see if we already have one pending or have given up. - * For nn_timeout, it is set when we close the connection - * because of the idle time out. So it means that we have - * at least connected to that node successfully once, - * now try to connect to it again. - */ - timeout = atomic_read(&nn->nn_timeout); - stop = (nn->nn_sc || - (nn->nn_persistent_error && - (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); - spin_unlock(&nn->nn_lock); - if (stop) - goto out; - - nn->nn_last_connect_attempt = jiffies; - - sc = sc_alloc(node); - if (sc == NULL) { - mlog(0, "couldn't allocate sc\n"); - ret = -ENOMEM; - goto out; - } - - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) { - mlog(0, "can't create socket: %d\n", ret); - goto out; - } - sc->sc_sock = sock; /* freed by sc_kref_release */ - - sock->sk->sk_allocation = GFP_ATOMIC; - - myaddr.sin_family = AF_INET; - myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; - myaddr.sin_port = htons(0); /* any port */ - - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, - sizeof(myaddr)); - if (ret) { - mlog(ML_ERROR, "bind failed with %d at address %pI4\n", - ret, &mynode->nd_ipv4_address); - goto out; - } - - ret = r2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - r2net_register_callbacks(sc->sc_sock->sk, sc); - - spin_lock(&nn->nn_lock); - /* handshake completion will set nn->nn_sc_valid */ - r2net_set_nn_state(nn, sc, 0, 0); - spin_unlock(&nn->nn_lock); - - remoteaddr.sin_family = AF_INET; - remoteaddr.sin_addr.s_addr = node->nd_ipv4_address; - remoteaddr.sin_port = node->nd_ipv4_port; - - ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, - sizeof(remoteaddr), - O_NONBLOCK); - if (ret == -EINPROGRESS) - ret = 0; - -out: - if (ret) { - printk(KERN_NOTICE "ramster: Connect attempt to " SC_NODEF_FMT - " failed with errno %d\n", sc->sc_node->nd_name, - sc->sc_node->nd_num, &sc->sc_node->nd_ipv4_address, - ntohs(sc->sc_node->nd_ipv4_port), ret); - /* 0 err so that another will be queued and attempted - * from set_nn_state */ - if (sc) - r2net_ensure_shutdown(nn, sc, 0); - } - if (sc) - sc_put(sc); - if (node) - r2nm_node_put(node); - if (mynode) - r2nm_node_put(mynode); - - return; -} - -static void r2net_connect_expired(struct work_struct *work) -{ - struct r2net_node *nn = - container_of(work, struct r2net_node, nn_connect_expired.work); - - spin_lock(&nn->nn_lock); - if (!nn->nn_sc_valid) { - printk(KERN_NOTICE "ramster: No connection established with " - "node %u after 
%u.%u seconds, giving up.\n", - r2net_num_from_nn(nn), - r2net_idle_timeout() / 1000, - r2net_idle_timeout() % 1000); - - r2net_set_nn_state(nn, NULL, 0, -ENOTCONN); - } - spin_unlock(&nn->nn_lock); -} - -static void r2net_still_up(struct work_struct *work) -{ -} - -/* ------------------------------------------------------------ */ - -void r2net_disconnect_node(struct r2nm_node *node) -{ - struct r2net_node *nn = r2net_nn_from_num(node->nd_num); - - /* don't reconnect until it's heartbeating again */ - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, NULL, 0, -ENOTCONN); - spin_unlock(&nn->nn_lock); - - if (r2net_wq) { - cancel_delayed_work(&nn->nn_connect_expired); - cancel_delayed_work(&nn->nn_connect_work); - cancel_delayed_work(&nn->nn_still_up); - flush_workqueue(r2net_wq); - } -} - -static void r2net_hb_node_down_cb(struct r2nm_node *node, int node_num, - void *data) -{ - if (!node) - return; - - if (node_num != r2nm_this_node()) - r2net_disconnect_node(node); - - BUG_ON(atomic_read(&r2net_connected_peers) < 0); -} - -static void r2net_hb_node_up_cb(struct r2nm_node *node, int node_num, - void *data) -{ - struct r2net_node *nn = r2net_nn_from_num(node_num); - - BUG_ON(!node); - - /* ensure an immediate connect attempt */ - nn->nn_last_connect_attempt = jiffies - - (msecs_to_jiffies(r2net_reconnect_delay()) + 1); - - if (node_num != r2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing - * can succeed for this node before we got here.. so - * only use set_nn_state to clear the persistent error - * if that hasn't already happened */ - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - if (nn->nn_persistent_error) - r2net_set_nn_state(nn, NULL, 0, 0); - spin_unlock(&nn->nn_lock); - } -} - -void r2net_unregister_hb_callbacks(void) -{ - r2hb_unregister_callback(NULL, &r2net_hb_up); - r2hb_unregister_callback(NULL, &r2net_hb_down); -} - -int r2net_register_hb_callbacks(void) -{ - int ret; - - r2hb_setup_callback(&r2net_hb_down, R2HB_NODE_DOWN_CB, - r2net_hb_node_down_cb, NULL, R2NET_HB_PRI); - r2hb_setup_callback(&r2net_hb_up, R2HB_NODE_UP_CB, - r2net_hb_node_up_cb, NULL, R2NET_HB_PRI); - - ret = r2hb_register_callback(NULL, &r2net_hb_up); - if (ret == 0) - ret = r2hb_register_callback(NULL, &r2net_hb_down); - - if (ret) - r2net_unregister_hb_callbacks(); - - return ret; -} - -/* ------------------------------------------------------------ */ - -static int r2net_accept_one(struct socket *sock) -{ - int ret, slen; - struct sockaddr_in sin; - struct socket *new_sock = NULL; - struct r2nm_node *node = NULL; - struct r2nm_node *local_node = NULL; - struct r2net_sock_container *sc = NULL; - struct r2net_node *nn; - - BUG_ON(sock == NULL); - ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); - if (ret) - goto out; - - new_sock->type = sock->type; - new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); - if (ret < 0) - goto out; - - new_sock->sk->sk_allocation = GFP_ATOMIC; - - ret = r2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } - - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); - if (ret < 0) - goto out; - - node = r2nm_get_node_by_ip(sin.sin_addr.s_addr); - if (node == NULL) { - printk(KERN_NOTICE "ramster: Attempt to connect from unknown " - "node at %pI4:%d\n", &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - ret = 
-EINVAL; - goto out; - } - - if (r2nm_this_node() >= node->nd_num) { - local_node = r2nm_get_node_by_num(r2nm_this_node()); - printk(KERN_NOTICE "ramster: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); - ret = -EINVAL; - goto out; - } - - /* this happens all the time when the other node sees our heartbeat - * and tries to connect before we see their heartbeat */ - if (!r2hb_check_node_heartbeating_from_callback(node->nd_num)) { - mlog(ML_CONN, "attempt to connect from node '%s' at " - "%pI4:%d but it isn't heartbeating\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - ret = -EINVAL; - goto out; - } - - nn = r2net_nn_from_num(node->nd_num); - - spin_lock(&nn->nn_lock); - if (nn->nn_sc) - ret = -EBUSY; - else - ret = 0; - spin_unlock(&nn->nn_lock); - if (ret) { - printk(KERN_NOTICE "ramster: Attempt to connect from node '%s' " - "at %pI4:%d but it already has an open connection\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port)); - goto out; - } - - sc = sc_alloc(node); - if (sc == NULL) { - ret = -ENOMEM; - goto out; - } - - sc->sc_sock = new_sock; - new_sock = NULL; - - spin_lock(&nn->nn_lock); - atomic_set(&nn->nn_timeout, 0); - r2net_set_nn_state(nn, sc, 0, 0); - spin_unlock(&nn->nn_lock); - - r2net_register_callbacks(sc->sc_sock->sk, sc); - r2net_sc_queue_work(sc, &sc->sc_rx_work); - - r2net_initialize_handshake(); - r2net_sendpage(sc, r2net_hand, sizeof(*r2net_hand)); - -out: - if (new_sock) - sock_release(new_sock); - if (node) - r2nm_node_put(node); - if (local_node) - r2nm_node_put(local_node); - if (sc) - sc_put(sc); - return ret; -} - -static void r2net_accept_many(struct work_struct *work) -{ - struct socket *sock = r2net_listen_sock; - while (r2net_accept_one(sock) == 0) - cond_resched(); -} - -static void r2net_listen_data_ready(struct sock *sk, int bytes) -{ - void (*ready)(struct sock *sk, int bytes); - - read_lock(&sk->sk_callback_lock); - ready = sk->sk_user_data; - if (ready == NULL) { /* check for teardown race */ - ready = sk->sk_data_ready; - goto out; - } - - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. 
we only want to queue listen work for our listening - * socket */ - if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); - queue_work(r2net_wq, &r2net_listen_work); - } - -out: - read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); -} - -static int r2net_open_listening_sock(__be32 addr, __be16 port) -{ - struct socket *sock = NULL; - int ret; - struct sockaddr_in sin = { - .sin_family = PF_INET, - .sin_addr = { .s_addr = addr }, - .sin_port = port, - }; - - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) { - printk(KERN_ERR "ramster: Error %d while creating socket\n", - ret); - goto out; - } - - sock->sk->sk_allocation = GFP_ATOMIC; - - write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_user_data = sock->sk->sk_data_ready; - sock->sk->sk_data_ready = r2net_listen_data_ready; - write_unlock_bh(&sock->sk->sk_callback_lock); - - r2net_listen_sock = sock; - INIT_WORK(&r2net_listen_work, r2net_accept_many); - - sock->sk->sk_reuse = 1; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) { - printk(KERN_ERR "ramster: Error %d while binding socket at " - "%pI4:%u\n", ret, &addr, ntohs(port)); - goto out; - } - - ret = sock->ops->listen(sock, 64); - if (ret < 0) - printk(KERN_ERR "ramster: Error %d while listening on %pI4:%u\n", - ret, &addr, ntohs(port)); - -out: - if (ret) { - r2net_listen_sock = NULL; - if (sock) - sock_release(sock); - } - return ret; -} - -/* - * called from node manager when we should bring up our network listening - * socket. node manager handles all the serialization to only call this - * once and to match it with r2net_stop_listening(). note, - * r2nm_this_node() doesn't work yet as we're being called while it - * is being set up. - */ -int r2net_start_listening(struct r2nm_node *node) -{ - int ret = 0; - - BUG_ON(r2net_wq != NULL); - BUG_ON(r2net_listen_sock != NULL); - - mlog(ML_KTHREAD, "starting r2net thread...\n"); - r2net_wq = create_singlethread_workqueue("r2net"); - if (r2net_wq == NULL) { - mlog(ML_ERROR, "unable to launch r2net thread\n"); - return -ENOMEM; /* ? 
*/ - } - - ret = r2net_open_listening_sock(node->nd_ipv4_address, - node->nd_ipv4_port); - if (ret) { - destroy_workqueue(r2net_wq); - r2net_wq = NULL; - } - - return ret; -} - -/* again, r2nm_this_node() doesn't work here as we're involved in - * tearing it down */ -void r2net_stop_listening(struct r2nm_node *node) -{ - struct socket *sock = r2net_listen_sock; - size_t i; - - BUG_ON(r2net_wq == NULL); - BUG_ON(r2net_listen_sock == NULL); - - /* stop the listening socket from generating work */ - write_lock_bh(&sock->sk->sk_callback_lock); - sock->sk->sk_data_ready = sock->sk->sk_user_data; - sock->sk->sk_user_data = NULL; - write_unlock_bh(&sock->sk->sk_callback_lock); - - for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) { - struct r2nm_node *node = r2nm_get_node_by_num(i); - if (node) { - r2net_disconnect_node(node); - r2nm_node_put(node); - } - } - - /* finish all work and tear down the work queue */ - mlog(ML_KTHREAD, "waiting for r2net thread to exit....\n"); - destroy_workqueue(r2net_wq); - r2net_wq = NULL; - - sock_release(r2net_listen_sock); - r2net_listen_sock = NULL; -} - -void r2net_hb_node_up_manual(int node_num) -{ - struct r2nm_node dummy; - if (r2nm_single_cluster == NULL) - pr_err("ramster: cluster not alive, node_up_manual ignored\n"); - else { - r2hb_manual_set_node_heartbeating(node_num); - r2net_hb_node_up_cb(&dummy, node_num, NULL); - } -} - -/* ------------------------------------------------------------ */ - -int r2net_init(void) -{ - unsigned long i; - - if (r2net_debugfs_init()) - return -ENOMEM; - - r2net_hand = kzalloc(sizeof(struct r2net_handshake), GFP_KERNEL); - r2net_keep_req = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL); - r2net_keep_resp = kzalloc(sizeof(struct r2net_msg), GFP_KERNEL); - if (!r2net_hand || !r2net_keep_req || !r2net_keep_resp) { - kfree(r2net_hand); - kfree(r2net_keep_req); - kfree(r2net_keep_resp); - return -ENOMEM; - } - - r2net_hand->protocol_version = cpu_to_be64(R2NET_PROTOCOL_VERSION); - r2net_hand->connector_id = cpu_to_be64(1); - - r2net_keep_req->magic = cpu_to_be16(R2NET_MSG_KEEP_REQ_MAGIC); - r2net_keep_resp->magic = cpu_to_be16(R2NET_MSG_KEEP_RESP_MAGIC); - - for (i = 0; i < ARRAY_SIZE(r2net_nodes); i++) { - struct r2net_node *nn = r2net_nn_from_num(i); - - atomic_set(&nn->nn_timeout, 0); - spin_lock_init(&nn->nn_lock); - INIT_DELAYED_WORK(&nn->nn_connect_work, r2net_start_connect); - INIT_DELAYED_WORK(&nn->nn_connect_expired, - r2net_connect_expired); - INIT_DELAYED_WORK(&nn->nn_still_up, r2net_still_up); - /* until we see hb from a node we'll return einval */ - nn->nn_persistent_error = -ENOTCONN; - init_waitqueue_head(&nn->nn_sc_wq); - idr_init(&nn->nn_status_idr); - INIT_LIST_HEAD(&nn->nn_status_list); - } - - return 0; -} - -void r2net_exit(void) -{ - kfree(r2net_hand); - kfree(r2net_keep_req); - kfree(r2net_keep_resp); - r2net_debugfs_exit(); -} diff --git a/drivers/staging/ramster/cluster/tcp.h b/drivers/staging/ramster/cluster/tcp.h deleted file mode 100755 index 9d0583345..000000000 --- a/drivers/staging/ramster/cluster/tcp.h +++ /dev/null @@ -1,159 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * tcp.h - * - * Function prototypes - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef R2CLUSTER_TCP_H -#define R2CLUSTER_TCP_H - -#include -#ifdef __KERNEL__ -#include -#include -#else -#include -#endif -#include -#include - -struct r2net_msg { - __be16 magic; - __be16 data_len; - __be16 msg_type; - __be16 pad1; - __be32 sys_status; - __be32 status; - __be32 key; - __be32 msg_num; - __u8 buf[0]; -}; - -typedef int (r2net_msg_handler_func)(struct r2net_msg *msg, u32 len, void *data, - void **ret_data); -typedef void (r2net_post_msg_handler_func)(int status, void *data, - void *ret_data); - -#define R2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct r2net_msg)) - -/* same as hb delay, we're waiting for another node to recognize our hb */ -#define R2NET_RECONNECT_DELAY_MS_DEFAULT 2000 - -#define R2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 -#define R2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 - - -/* TODO: figure this out.... */ -static inline int r2net_link_down(int err, struct socket *sock) -{ - if (sock) { - if (sock->sk->sk_state != TCP_ESTABLISHED && - sock->sk->sk_state != TCP_CLOSE_WAIT) - return 1; - } - - if (err >= 0) - return 0; - switch (err) { - - /* ????????????????????????? */ - case -ERESTARTSYS: - case -EBADF: - /* When the server has died, an ICMP port unreachable - * message prompts ECONNREFUSED. */ - case -ECONNREFUSED: - case -ENOTCONN: - case -ECONNRESET: - case -EPIPE: - return 1; - - } - return 0; -} - -enum { - R2NET_DRIVER_UNINITED, - R2NET_DRIVER_READY, -}; - -int r2net_send_message(u32 msg_type, u32 key, void *data, u32 len, - u8 target_node, int *status); -int r2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, - size_t veclen, u8 target_node, int *status); - -int r2net_register_handler(u32 msg_type, u32 key, u32 max_len, - r2net_msg_handler_func *func, void *data, - r2net_post_msg_handler_func *post_func, - struct list_head *unreg_list); -void r2net_unregister_handler_list(struct list_head *list); - -void r2net_fill_node_map(unsigned long *map, unsigned bytes); - -void r2net_force_data_magic(struct r2net_msg *, u16, u32); -void r2net_hb_node_up_manual(int); -struct r2net_node *r2net_nn_from_num(u8); - -struct r2nm_node; -int r2net_register_hb_callbacks(void); -void r2net_unregister_hb_callbacks(void); -int r2net_start_listening(struct r2nm_node *node); -void r2net_stop_listening(struct r2nm_node *node); -void r2net_disconnect_node(struct r2nm_node *node); -int r2net_num_connected_peers(void); - -int r2net_init(void); -void r2net_exit(void); - -struct r2net_send_tracking; -struct r2net_sock_container; - -#if 0 -int r2net_debugfs_init(void); -void r2net_debugfs_exit(void); -void r2net_debug_add_nst(struct r2net_send_tracking *nst); -void r2net_debug_del_nst(struct r2net_send_tracking *nst); -void r2net_debug_add_sc(struct r2net_sock_container *sc); -void r2net_debug_del_sc(struct r2net_sock_container *sc); -#else -static inline int r2net_debugfs_init(void) -{ - return 0; -} -static inline void r2net_debugfs_exit(void) -{ -} -static inline void r2net_debug_add_nst(struct r2net_send_tracking *nst) -{ -} -static inline void r2net_debug_del_nst(struct r2net_send_tracking *nst) 
-{ -} -static inline void r2net_debug_add_sc(struct r2net_sock_container *sc) -{ -} -static inline void r2net_debug_del_sc(struct r2net_sock_container *sc) -{ -} -#endif /* CONFIG_DEBUG_FS */ - -#endif /* R2CLUSTER_TCP_H */ diff --git a/drivers/staging/ramster/cluster/tcp_internal.h b/drivers/staging/ramster/cluster/tcp_internal.h deleted file mode 100755 index 4d8cc9f96..000000000 --- a/drivers/staging/ramster/cluster/tcp_internal.h +++ /dev/null @@ -1,248 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef R2CLUSTER_TCP_INTERNAL_H -#define R2CLUSTER_TCP_INTERNAL_H - -#define R2NET_MSG_MAGIC ((u16)0xfa55) -#define R2NET_MSG_STATUS_MAGIC ((u16)0xfa56) -#define R2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) -#define R2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) -/* - * "data magic" is a long version of "status magic" where the message - * payload actually contains data to be passed in reply to certain messages - */ -#define R2NET_MSG_DATA_MAGIC ((u16)0xfa59) - -/* we're delaying our quorum decision so that heartbeat will have timed - * out truly dead nodes by the time we come around to making decisions - * on their number */ -#define R2NET_QUORUM_DELAY_MS \ - ((r2hb_dead_threshold + 2) * R2HB_REGION_TIMEOUT_MS) - -/* - * This version number represents quite a lot, unfortunately. It not - * only represents the raw network message protocol on the wire but also - * locking semantics of the file system using the protocol. It should - * be somewhere else, I'm sure, but right now it isn't. - * - * With version 11, we separate out the filesystem locking portion. The - * filesystem now has a major.minor version it negotiates. Version 11 - * introduces this negotiation to the r2dlm protocol, and as such the - * version here in tcp_internal.h should not need to be bumped for - * filesystem locking changes. - * - * New in version 11 - * - Negotiation of filesystem locking in the dlm join. - * - * New in version 10: - * - Meta/data locks combined - * - * New in version 9: - * - All votes removed - * - * New in version 8: - * - Replace delete inode votes with a cluster lock - * - * New in version 7: - * - DLM join domain includes the live nodemap - * - * New in version 6: - * - DLM lockres remote refcount fixes. - * - * New in version 5: - * - Network timeout checking protocol - * - * New in version 4: - * - Remove i_generation from lock names for better stat performance. 
- * - * New in version 3: - * - Replace dentry votes with a cluster lock - * - * New in version 2: - * - full 64 bit i_size in the metadata lock lvbs - * - introduction of "rw" lock and pushing meta/data locking down - */ -#define R2NET_PROTOCOL_VERSION 11ULL -struct r2net_handshake { - __be64 protocol_version; - __be64 connector_id; - __be32 r2hb_heartbeat_timeout_ms; - __be32 r2net_idle_timeout_ms; - __be32 r2net_keepalive_delay_ms; - __be32 r2net_reconnect_delay_ms; -}; - -struct r2net_node { - /* this is never called from int/bh */ - spinlock_t nn_lock; - - /* set the moment an sc is allocated and a connect is started */ - struct r2net_sock_container *nn_sc; - /* _valid is only set after the handshake passes and tx can happen */ - unsigned nn_sc_valid:1; - /* if this is set tx just returns it */ - int nn_persistent_error; - /* It is only set to 1 after the idle time out. */ - atomic_t nn_timeout; - - /* threads waiting for an sc to arrive wait on the wq for generation - * to increase. it is increased when a connecting socket succeeds - * or fails or when an accepted socket is attached. */ - wait_queue_head_t nn_sc_wq; - - struct idr nn_status_idr; - struct list_head nn_status_list; - - /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before R2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ - struct delayed_work nn_connect_work; - unsigned long nn_last_connect_attempt; - - /* this is queued as nodes come up and is canceled when a connection is - * established. this expiring gives up on the node and errors out - * transmits */ - struct delayed_work nn_connect_expired; - - /* after we give up on a socket we wait a while before deciding - * that it is still heartbeating and that we should do some - * quorum work */ - struct delayed_work nn_still_up; -}; - -struct r2net_sock_container { - struct kref sc_kref; - /* the next two are valid for the life time of the sc */ - struct socket *sc_sock; - struct r2nm_node *sc_node; - - /* all of these sc work structs hold refs on the sc while they are - * queued. they should not be able to ref a freed sc. the teardown - * race is with r2net_wq destruction in r2net_stop_listening() */ - - /* rx and connect work are generated from socket callbacks. sc - * shutdown removes the callbacks and then flushes the work queue */ - struct work_struct sc_rx_work; - struct work_struct sc_connect_work; - /* shutdown work is triggered in two ways. the simple way is - * for a code path calls ensure_shutdown which gets a lock, removes - * the sc from the nn, and queues the work. in this case the - * work is single-shot. the work is also queued from a sock - * callback, though, and in this case the work will find the sc - * still on the nn and will call ensure_shutdown itself.. this - * ends up triggering the shutdown work again, though nothing - * will be done in that second iteration. so work queue teardown - * has to be careful to remove the sc from the nn before waiting - * on the work queue so that the shutdown work doesn't remove the - * sc and rearm itself. 
- */ - struct work_struct sc_shutdown_work; - - struct timer_list sc_idle_timeout; - struct delayed_work sc_keepalive_work; - - unsigned sc_handshake_ok:1; - - struct page *sc_page; - size_t sc_page_off; - - /* original handlers for the sockets */ - void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); - - u32 sc_msg_key; - u16 sc_msg_type; - -#ifdef CONFIG_DEBUG_FS - struct list_head sc_net_debug_item; - ktime_t sc_tv_timer; - ktime_t sc_tv_data_ready; - ktime_t sc_tv_advance_start; - ktime_t sc_tv_advance_stop; - ktime_t sc_tv_func_start; - ktime_t sc_tv_func_stop; -#endif -#ifdef CONFIG_RAMSTER_FS_STATS - ktime_t sc_tv_acquiry_total; - ktime_t sc_tv_send_total; - ktime_t sc_tv_status_total; - u32 sc_send_count; - u32 sc_recv_count; - ktime_t sc_tv_process_total; -#endif - struct mutex sc_send_lock; -}; - -struct r2net_msg_handler { - struct rb_node nh_node; - u32 nh_max_len; - u32 nh_msg_type; - u32 nh_key; - r2net_msg_handler_func *nh_func; - r2net_msg_handler_func *nh_func_data; - r2net_post_msg_handler_func - *nh_post_func; - struct kref nh_kref; - struct list_head nh_unregister_item; -}; - -enum r2net_system_error { - R2NET_ERR_NONE = 0, - R2NET_ERR_NO_HNDLR, - R2NET_ERR_OVERFLOW, - R2NET_ERR_DIED, - R2NET_ERR_MAX -}; - -struct r2net_status_wait { - enum r2net_system_error ns_sys_status; - s32 ns_status; - int ns_id; - wait_queue_head_t ns_wq; - struct list_head ns_node_item; -}; - -#ifdef CONFIG_DEBUG_FS -/* just for state dumps */ -struct r2net_send_tracking { - struct list_head st_net_debug_item; - struct task_struct *st_task; - struct r2net_sock_container *st_sc; - u32 st_id; - u32 st_msg_type; - u32 st_msg_key; - u8 st_node; - ktime_t st_sock_time; - ktime_t st_send_time; - ktime_t st_status_time; -}; -#else -struct r2net_send_tracking { - u32 dummy; -}; -#endif /* CONFIG_DEBUG_FS */ - -#endif /* R2CLUSTER_TCP_INTERNAL_H */ diff --git a/drivers/staging/ramster/r2net.c b/drivers/staging/ramster/r2net.c deleted file mode 100644 index 2ee02204c..000000000 --- a/drivers/staging/ramster/r2net.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - * r2net.c - * - * Copyright (c) 2011, Dan Magenheimer, Oracle Corp. - * - * Ramster_r2net provides an interface between zcache and r2net. 
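Aside for reviewers of this removal: a minimal user-space sketch of the wire framing implied by the deleted struct r2net_msg and R2NET_MAX_PAYLOAD_BYTES in cluster/tcp.h above. The mirror struct and the fill_header() helper are illustrative, not kernel code; only the field layout, the 0xfa55 magic, the RMSTR_KEY value and message type 100 come from the removed sources.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>			/* htons()/htonl() for the __be fields */

/* user-space mirror of the deleted struct r2net_msg (24-byte header) */
struct r2net_msg_wire {
	uint16_t magic;			/* R2NET_MSG_MAGIC and friends */
	uint16_t data_len;		/* payload bytes following the header */
	uint16_t msg_type;		/* e.g. RMSTR_TMEM_PUT_EPH == 100 */
	uint16_t pad1;
	uint32_t sys_status;
	uint32_t status;
	uint32_t key;			/* handler key, e.g. RMSTR_KEY */
	uint32_t msg_num;
};

#define R2NET_MSG_MAGIC		0xfa55
#define RMSTR_KEY		0x77347734
#define MAX_PAYLOAD		(4096 - sizeof(struct r2net_msg_wire))

static size_t fill_header(struct r2net_msg_wire *h, uint16_t type, size_t len)
{
	if (len > MAX_PAYLOAD)
		return 0;		/* payload would not fit in one frame */
	memset(h, 0, sizeof(*h));
	h->magic = htons(R2NET_MSG_MAGIC);
	h->data_len = htons((uint16_t)len);
	h->msg_type = htons(type);
	h->key = htonl(RMSTR_KEY);
	return sizeof(*h) + len;	/* bytes to put on the wire */
}

int main(void)
{
	struct r2net_msg_wire h;

	printf("header %zu bytes, max payload %zu, frame %zu\n",
	       sizeof(h), MAX_PAYLOAD, fill_header(&h, 100, 1));
	return 0;
}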
- * - * FIXME: support more than two nodes - */ - -#include -#include "cluster/tcp.h" -#include "cluster/nodemanager.h" -#include "tmem.h" -#include "zcache.h" -#include "ramster.h" - -#define RAMSTER_TESTING - -#define RMSTR_KEY 0x77347734 - -enum { - RMSTR_TMEM_PUT_EPH = 100, - RMSTR_TMEM_PUT_PERS, - RMSTR_TMEM_ASYNC_GET_REQUEST, - RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST, - RMSTR_TMEM_ASYNC_GET_REPLY, - RMSTR_TMEM_FLUSH, - RMSTR_TMEM_FLOBJ, - RMSTR_TMEM_DESTROY_POOL, -}; - -#define RMSTR_R2NET_MAX_LEN \ - (R2NET_MAX_PAYLOAD_BYTES - sizeof(struct tmem_xhandle)) - -#include "cluster/tcp_internal.h" - -static struct r2nm_node *r2net_target_node; -static int r2net_target_nodenum; - -int r2net_remote_target_node_set(int node_num) -{ - int ret = -1; - - r2net_target_node = r2nm_get_node_by_num(node_num); - if (r2net_target_node != NULL) { - r2net_target_nodenum = node_num; - r2nm_node_put(r2net_target_node); - ret = 0; - } - return ret; -} - -/* FIXME following buffer should be per-cpu, protected by preempt_disable */ -static char ramster_async_get_buf[R2NET_MAX_PAYLOAD_BYTES]; - -static int ramster_remote_async_get_request_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - char *pdata; - struct tmem_xhandle xh; - int found; - size_t size = RMSTR_R2NET_MAX_LEN; - u16 msgtype = be16_to_cpu(msg->msg_type); - bool get_and_free = (msgtype == RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST); - unsigned long flags; - - xh = *(struct tmem_xhandle *)msg->buf; - if (xh.xh_data_size > RMSTR_R2NET_MAX_LEN) - BUG(); - pdata = ramster_async_get_buf; - *(struct tmem_xhandle *)pdata = xh; - pdata += sizeof(struct tmem_xhandle); - local_irq_save(flags); - found = zcache_get(xh.client_id, xh.pool_id, &xh.oid, xh.index, - pdata, &size, 1, get_and_free ? 1 : -1); - local_irq_restore(flags); - if (found < 0) { - /* a zero size indicates the get failed */ - size = 0; - } - if (size > RMSTR_R2NET_MAX_LEN) - BUG(); - *ret_data = pdata - sizeof(struct tmem_xhandle); - /* now make caller (r2net_process_message) handle specially */ - r2net_force_data_magic(msg, RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY); - return size + sizeof(struct tmem_xhandle); -} - -static int ramster_remote_async_get_reply_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - char *in = (char *)msg->buf; - int datalen = len - sizeof(struct r2net_msg); - int ret = -1; - struct tmem_xhandle *xh = (struct tmem_xhandle *)in; - - in += sizeof(struct tmem_xhandle); - datalen -= sizeof(struct tmem_xhandle); - BUG_ON(datalen < 0 || datalen > PAGE_SIZE); - ret = zcache_localify(xh->pool_id, &xh->oid, xh->index, - in, datalen, xh->extra); -#ifdef RAMSTER_TESTING - if (ret == -EEXIST) - pr_err("TESTING ArrgREP, aborted overwrite on racy put\n"); -#endif - return ret; -} - -int ramster_remote_put_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - int datalen = len - sizeof(struct r2net_msg) - - sizeof(struct tmem_xhandle); - u16 msgtype = be16_to_cpu(msg->msg_type); - bool ephemeral = (msgtype == RMSTR_TMEM_PUT_EPH); - unsigned long flags; - int ret; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - zcache_autocreate_pool(xh->client_id, xh->pool_id, ephemeral); - local_irq_save(flags); - ret = zcache_put(xh->client_id, xh->pool_id, &xh->oid, xh->index, - p, datalen, 1, ephemeral ? 
1 : -1); - local_irq_restore(flags); - return ret; -} - -int ramster_remote_flush_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - (void)zcache_flush(xh->client_id, xh->pool_id, &xh->oid, xh->index); - return 0; -} - -int ramster_remote_flobj_handler(struct r2net_msg *msg, - u32 len, void *data, void **ret_data) -{ - struct tmem_xhandle *xh; - char *p = (char *)msg->buf; - - xh = (struct tmem_xhandle *)p; - p += sizeof(struct tmem_xhandle); - (void)zcache_flush_object(xh->client_id, xh->pool_id, &xh->oid); - return 0; -} - -int ramster_remote_async_get(struct tmem_xhandle *xh, bool free, int remotenode, - size_t expect_size, uint8_t expect_cksum, - void *extra) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - u32 msg_type; - - node = r2nm_get_node_by_num(remotenode); - if (node == NULL) - goto out; - xh->client_id = r2nm_this_node(); /* which node is getting */ - xh->xh_data_cksum = expect_cksum; - xh->xh_data_size = expect_size; - xh->extra = extra; - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - if (free) - msg_type = RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST; - else - msg_type = RMSTR_TMEM_ASYNC_GET_REQUEST; - ret = r2net_send_message_vec(msg_type, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - if (ret < 0) { - /* FIXME handle bad message possibilities here? */ - pr_err("UNTESTED ret<0 in ramster_remote_async_get\n"); - } - ret = status; -out: - return ret; -} - -#ifdef RAMSTER_TESTING -/* leave me here to see if it catches a weird crash */ -static void ramster_check_irq_counts(void) -{ - static int last_hardirq_cnt, last_softirq_cnt, last_preempt_cnt; - int cur_hardirq_cnt, cur_softirq_cnt, cur_preempt_cnt; - - cur_hardirq_cnt = hardirq_count() >> HARDIRQ_SHIFT; - if (cur_hardirq_cnt > last_hardirq_cnt) { - last_hardirq_cnt = cur_hardirq_cnt; - if (!(last_hardirq_cnt&(last_hardirq_cnt-1))) - pr_err("RAMSTER TESTING RRP hardirq_count=%d\n", - last_hardirq_cnt); - } - cur_softirq_cnt = softirq_count() >> SOFTIRQ_SHIFT; - if (cur_softirq_cnt > last_softirq_cnt) { - last_softirq_cnt = cur_softirq_cnt; - if (!(last_softirq_cnt&(last_softirq_cnt-1))) - pr_err("RAMSTER TESTING RRP softirq_count=%d\n", - last_softirq_cnt); - } - cur_preempt_cnt = preempt_count() & PREEMPT_MASK; - if (cur_preempt_cnt > last_preempt_cnt) { - last_preempt_cnt = cur_preempt_cnt; - if (!(last_preempt_cnt&(last_preempt_cnt-1))) - pr_err("RAMSTER TESTING RRP preempt_count=%d\n", - last_preempt_cnt); - } -} -#endif - -int ramster_remote_put(struct tmem_xhandle *xh, char *data, size_t size, - bool ephemeral, int *remotenode) -{ - int nodenum, ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[2]; - size_t veclen = 2; - u32 msg_type; -#ifdef RAMSTER_TESTING - struct r2net_node *nn; -#endif - - BUG_ON(size > RMSTR_R2NET_MAX_LEN); - xh->client_id = r2nm_this_node(); /* which node is putting */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - vec[1].iov_len = size; - vec[1].iov_base = data; - node = r2net_target_node; - if (!node) - goto out; - - nodenum = r2net_target_nodenum; - - r2nm_node_get(node); - -#ifdef RAMSTER_TESTING - nn = r2net_nn_from_num(nodenum); - WARN_ON_ONCE(nn->nn_persistent_error || !nn->nn_sc_valid); -#endif - - if (ephemeral) - msg_type = RMSTR_TMEM_PUT_EPH; - else - msg_type = RMSTR_TMEM_PUT_PERS; -#ifdef RAMSTER_TESTING - /* leave me 
here to see if it catches a weird crash */ - ramster_check_irq_counts(); -#endif - - ret = r2net_send_message_vec(msg_type, RMSTR_KEY, vec, veclen, - nodenum, &status); -#ifdef RAMSTER_TESTING - if (ret != 0) { - static unsigned long cnt; - cnt++; - if (!(cnt&(cnt-1))) - pr_err("ramster_remote_put: message failed, " - "ret=%d, cnt=%lu\n", ret, cnt); - ret = -1; - } -#endif - if (ret < 0) - ret = -1; - else { - ret = status; - *remotenode = nodenum; - } - - r2nm_node_put(node); -out: - return ret; -} - -int ramster_remote_flush(struct tmem_xhandle *xh, int remotenode) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - - node = r2nm_get_node_by_num(remotenode); - BUG_ON(node == NULL); - xh->client_id = r2nm_this_node(); /* which node is flushing */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - BUG_ON(irqs_disabled()); - BUG_ON(in_softirq()); - ret = r2net_send_message_vec(RMSTR_TMEM_FLUSH, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - return ret; -} - -int ramster_remote_flush_object(struct tmem_xhandle *xh, int remotenode) -{ - int ret = -1, status; - struct r2nm_node *node = NULL; - struct kvec vec[1]; - size_t veclen = 1; - - node = r2nm_get_node_by_num(remotenode); - BUG_ON(node == NULL); - xh->client_id = r2nm_this_node(); /* which node is flobjing */ - vec[0].iov_len = sizeof(*xh); - vec[0].iov_base = xh; - ret = r2net_send_message_vec(RMSTR_TMEM_FLOBJ, RMSTR_KEY, - vec, veclen, remotenode, &status); - r2nm_node_put(node); - return ret; -} - -/* - * Handler registration - */ - -static LIST_HEAD(r2net_unreg_list); - -static void r2net_unregister_handlers(void) -{ - r2net_unregister_handler_list(&r2net_unreg_list); -} - -int r2net_register_handlers(void) -{ - int status; - - status = r2net_register_handler(RMSTR_TMEM_PUT_EPH, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_put_handler, - NULL, NULL, &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_PUT_PERS, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_put_handler, - NULL, NULL, &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REQUEST, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_request_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_AND_FREE_REQUEST, - RMSTR_KEY, RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_request_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_ASYNC_GET_REPLY, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_async_get_reply_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_FLUSH, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_flush_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - status = r2net_register_handler(RMSTR_TMEM_FLOBJ, RMSTR_KEY, - RMSTR_R2NET_MAX_LEN, - ramster_remote_flobj_handler, - NULL, NULL, - &r2net_unreg_list); - if (status) - goto bail; - - pr_info("ramster: r2net handlers registered\n"); - -bail: - if (status) { - r2net_unregister_handlers(); - pr_err("ramster: couldn't register r2net handlers\n"); - } - return status; -} diff --git a/drivers/staging/ramster/ramster.h b/drivers/staging/ramster/ramster.h deleted file mode 100644 index 0c9455e8d..000000000 --- a/drivers/staging/ramster/ramster.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * 
ramster.h - * - * Peer-to-peer transcendent memory - * - * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _RAMSTER_H_ -#define _RAMSTER_H_ - -/* - * format of remote pampd: - * bit 0 == intransit - * bit 1 == is_remote... if this bit is set, then - * bit 2-9 == remotenode - * bit 10-22 == size - * bit 23-30 == cksum - */ -#define FAKE_PAMPD_INTRANSIT_BITS 1 -#define FAKE_PAMPD_ISREMOTE_BITS 1 -#define FAKE_PAMPD_REMOTENODE_BITS 8 -#define FAKE_PAMPD_REMOTESIZE_BITS 13 -#define FAKE_PAMPD_CHECKSUM_BITS 8 - -#define FAKE_PAMPD_INTRANSIT_SHIFT 0 -#define FAKE_PAMPD_ISREMOTE_SHIFT (FAKE_PAMPD_INTRANSIT_SHIFT + \ - FAKE_PAMPD_INTRANSIT_BITS) -#define FAKE_PAMPD_REMOTENODE_SHIFT (FAKE_PAMPD_ISREMOTE_SHIFT + \ - FAKE_PAMPD_ISREMOTE_BITS) -#define FAKE_PAMPD_REMOTESIZE_SHIFT (FAKE_PAMPD_REMOTENODE_SHIFT + \ - FAKE_PAMPD_REMOTENODE_BITS) -#define FAKE_PAMPD_CHECKSUM_SHIFT (FAKE_PAMPD_REMOTESIZE_SHIFT + \ - FAKE_PAMPD_REMOTESIZE_BITS) - -#define FAKE_PAMPD_MASK(x) ((1UL << (x)) - 1) - -static inline void *pampd_make_remote(int remotenode, size_t size, - unsigned char cksum) -{ - unsigned long fake_pampd = 0; - fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT; - fake_pampd |= ((unsigned long)remotenode & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS)) << - FAKE_PAMPD_REMOTENODE_SHIFT; - fake_pampd |= ((unsigned long)size & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS)) << - FAKE_PAMPD_REMOTESIZE_SHIFT; - fake_pampd |= ((unsigned long)cksum & - FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS)) << - FAKE_PAMPD_CHECKSUM_SHIFT; - return (void *)fake_pampd; -} - -static inline unsigned int pampd_remote_node(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_REMOTENODE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTENODE_BITS); -} - -static inline unsigned int pampd_remote_size(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_REMOTESIZE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_REMOTESIZE_BITS); -} - -static inline unsigned char pampd_remote_cksum(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_CHECKSUM_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_CHECKSUM_BITS); -} - -static inline bool pampd_is_remote(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_ISREMOTE_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_ISREMOTE_BITS); -} - -static inline bool pampd_is_intransit(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - return (fake_pampd >> FAKE_PAMPD_INTRANSIT_SHIFT) & - FAKE_PAMPD_MASK(FAKE_PAMPD_INTRANSIT_BITS); -} - -/* note that it is a BUG for intransit to be set without isremote also set */ -static inline void *pampd_mark_intransit(void *pampd) -{ - unsigned long fake_pampd = (unsigned long)pampd; - - fake_pampd |= 1UL << FAKE_PAMPD_ISREMOTE_SHIFT; - fake_pampd |= 1UL << FAKE_PAMPD_INTRANSIT_SHIFT; - return (void *)fake_pampd; -} - -static inline void *pampd_mask_intransit_and_remote(void *marked_pampd) -{ - unsigned long pampd = (unsigned long)marked_pampd; - - pampd &= ~(1UL << FAKE_PAMPD_INTRANSIT_SHIFT); - pampd &= ~(1UL << FAKE_PAMPD_ISREMOTE_SHIFT); - return (void *)pampd; -} - -extern int ramster_remote_async_get(struct tmem_xhandle *, - bool, int, size_t, uint8_t, void *extra); -extern int ramster_remote_put(struct tmem_xhandle *, char *, size_t, - bool, int *); -extern int ramster_remote_flush(struct tmem_xhandle *, int); -extern int ramster_remote_flush_object(struct 
tmem_xhandle *, int); -extern int r2net_register_handlers(void); -extern int r2net_remote_target_node_set(int); - -#endif /* _TMEM_H */ diff --git a/drivers/staging/ramster/tmem.c b/drivers/staging/ramster/tmem.c deleted file mode 100644 index 8f2f6892d..000000000 --- a/drivers/staging/ramster/tmem.c +++ /dev/null @@ -1,851 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). - */ - -#include -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. 
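A stand-alone illustration of the keying just described: the 192-bit oid width, the most-significant-word-first comparison and the 8 bucket bits match the deleted tmem.h further down, but the xor-fold hash below is only a simplified stand-in for the kernel's hash_long().

#include <stdint.h>
#include <stdio.h>

struct oid192 {
	uint64_t w[3];
};

/* same ordering as the deleted tmem_oid_compare(): w[2], then w[1], then w[0] */
static int oid_compare(const struct oid192 *l, const struct oid192 *r)
{
	for (int i = 2; i >= 0; i--) {
		if (l->w[i] < r->w[i])
			return -1;
		if (l->w[i] > r->w[i])
			return 1;
	}
	return 0;
}

#define HASH_BUCKET_BITS 8	/* TMEM_HASH_BUCKET_BITS in the deleted header */

static unsigned bucket_of(const struct oid192 *o)
{
	uint64_t x = o->w[0] ^ o->w[1] ^ o->w[2];

	/* fold 64 bits down to HASH_BUCKET_BITS (simplified, not hash_long()) */
	x ^= x >> 32;
	x ^= x >> 16;
	x ^= x >> 8;
	return (unsigned)(x & ((1u << HASH_BUCKET_BITS) - 1));
}

int main(void)
{
	struct oid192 a = { { 1, 2, 3 } }, b = { { 1, 2, 4 } };

	printf("cmp=%d bucket(a)=%u\n", oid_compare(&a, &b), bucket_of(&a));
	return 0;
}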
- */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - (*tmem_pamops.new_obj)(obj); - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! */ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. 
(The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. - * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not found) - */ -static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? (void **)slot : NULL; -} - -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode **slot; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - return slot != NULL ? 
*slot : NULL; -} - -static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, - void *new_pampd, bool no_free) -{ - struct tmem_objnode **slot; - void *ret = NULL; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - if ((slot != NULL) && (*slot != NULL)) { - void *old_pampd = *(void **)slot; - *(void **)slot = new_pampd; - if (!no_free) - (*tmem_pamops.free)(old_pampd, obj->pool, - NULL, 0, false); - ret = new_pampd; - } - return ret; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. */ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - 
obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool, NULL, 0, true); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, - obj->pool, NULL, 0, true); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; - (*tmem_pamops.free_obj)(obj->pool, obj); -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. 
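The duplicate-put rule described above is small enough to show as a toy user-space table; the flush-then-insert order is the point, everything else (sizes, names) is illustrative and not the tmem code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_entry {
	int used;
	uint32_t index;
	char data[16];
};

static struct toy_entry store[8];

static struct toy_entry *find(uint32_t index)
{
	for (int i = 0; i < 8; i++)
		if (store[i].used && store[i].index == index)
			return &store[i];
	return NULL;
}

static int toy_put(uint32_t index, const char *data)
{
	struct toy_entry *e = find(index);

	if (e)				/* duplicate put: flush the old copy first */
		e->used = 0;
	for (int i = 0; i < 8; i++) {
		if (!store[i].used) {
			store[i].used = 1;
			store[i].index = index;
			snprintf(store[i].data, sizeof(store[i].data), "%s", data);
			return 0;
		}
	}
	return -1;			/* table full: the put simply fails */
}

int main(void)
{
	toy_put(5, "old");
	toy_put(5, "new");		/* old copy flushed before the new insert */
	printf("index 5 -> %s\n", find(5)->data);
	return 0;
}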
- */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t size, bool raw, int ephemeral) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool, oidp, index, true); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(data, size, raw, ephemeral, - obj->pool, &obj->oid, index); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool, NULL, 0, true); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, struct tmem_obj **ret_obj, - void **saved_hb) -{ - struct tmem_hashbucket *hb; - struct tmem_obj *obj = NULL; - void *pampd = NULL; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (likely(obj != NULL)) - pampd = tmem_pampd_lookup_in_obj(obj, index); - *ret_obj = obj; - *saved_hb = (void *)hb; - /* note, hashbucket remains locked */ - return pampd; -} - -void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, - void *pampd, void *saved_hb, bool delete) -{ - struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb; - - BUG_ON(!spin_is_locked(&hb->lock)); - if (pampd != NULL) { - BUG_ON(obj == NULL); - (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1); - } else if (delete) { - BUG_ON(obj == NULL); - (void)tmem_pampd_delete_from_obj(obj, index); - } - spin_unlock(&hb->lock); -} - -static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb, - struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, bool free, char *data) -{ - void *old_pampd = *ppampd, *new_pampd = NULL; - bool intransit = false; - int ret = 0; - - - if (!is_ephemeral(pool)) - new_pampd = (*tmem_pamops.repatriate_preload)( - old_pampd, pool, oidp, index, &intransit); - if (intransit) - ret = -EAGAIN; - else if (new_pampd != NULL) - *ppampd = new_pampd; - /* must release the hb->lock else repatriate can't sleep */ - spin_unlock(&hb->lock); - if (!intransit) - ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool, - oidp, index, free, data); - return ret; -} - -/* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. 
By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). - - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t *size, bool raw, int get_and_free) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - int ret = -1; - struct tmem_hashbucket *hb; - bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); - bool lock_held = 0; - void **ppampd; - -again: - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - lock_held = 1; - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - ppampd = __tmem_pampd_lookup_in_obj(obj, index); - if (ppampd == NULL) - goto out; - if (tmem_pamops.is_remote(*ppampd)) { - ret = tmem_repatriate(ppampd, hb, pool, oidp, - index, free, data); - lock_held = 0; /* note hb->lock has been unlocked */ - if (ret == -EAGAIN) { - /* rare I think, but should cond_resched()??? */ - usleep_range(10, 1000); - goto again; - } else if (ret != 0) { - if (ret != -ENOENT) - pr_err("UNTESTED case in tmem_get, ret=%d\n", - ret); - ret = -1; - goto out; - } - goto out; - } - if (free) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - if (free) { - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - if (free) - ret = (*tmem_pamops.get_data_and_free)( - data, size, raw, pampd, pool, oidp, index); - else - ret = (*tmem_pamops.get_data)( - data, size, raw, pampd, pool, oidp, index); - if (ret < 0) - goto out; - ret = 0; -out: - if (lock_held) - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). - */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool, oidp, index, true); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, replace the page so that any - * subsequent "get" gets the new page. Returns the new page if - * there was a page to replace, else returns NULL. 
- */ -int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, void *new_pampd) -{ - struct tmem_obj *obj; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0); - ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. - */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. - */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/ramster/tmem.h b/drivers/staging/ramster/tmem.h deleted file mode 100644 index 47f1918c8..000000000 --- a/drivers/staging/ramster/tmem.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. 
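The pool flag bits defined above decode as in the stand-alone sketch below; the pagesize field is decoded only to show the layout, since the deleted tmem_new_pool() consumes just the persistent and shared bits.

#include <stdint.h>
#include <stdio.h>

#define TMEM_POOL_PERSIST		1
#define TMEM_POOL_SHARED		2
#define TMEM_POOL_PAGESIZE_SHIFT	4
#define TMEM_POOL_PAGESIZE_MASK		0xf

static void decode_pool_flags(uint32_t flags)
{
	int persistent = !!(flags & TMEM_POOL_PERSIST);	/* bit 0 */
	int shared = !!(flags & TMEM_POOL_SHARED);	/* bit 1 */
	unsigned pagesize_field =
		(flags >> TMEM_POOL_PAGESIZE_SHIFT) & TMEM_POOL_PAGESIZE_MASK;

	printf("persistent=%d shared=%d pagesize_field=%u\n",
	       persistent, shared, pagesize_field);
}

int main(void)
{
	decode_pool_flags(TMEM_POOL_PERSIST | TMEM_POOL_SHARED);
	return 0;
}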
- */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) WARN_ON(!spin_is_locked(_l)) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). - */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -struct tmem_xhandle { - uint8_t client_id; - uint8_t xh_data_cksum; - uint16_t xh_data_size; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - void *extra; -}; - -static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id, - struct tmem_pool *pool, - struct tmem_oid *oidp, - uint32_t index) -{ - struct tmem_xhandle xh; - xh.client_id = client_id; - xh.xh_data_cksum = (uint8_t)-1; - xh.xh_data_size = (uint16_t)-1; - xh.pool_id = pool->pool_id; - xh.oid = *oidp; - xh.index = index; - return xh; -} - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. 
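A stand-alone sketch of the index arithmetic behind that radix-tree-like structure, using the 6-bit-per-level OBJNODE_TREE_MAP_SHIFT from the definitions that follow; it mirrors the shift/mask walk of the deleted __tmem_pampd_lookup_in_obj() but only prints the slot path, it does not build a tree.

#include <stdint.h>
#include <stdio.h>

#define MAP_SHIFT 6				/* OBJNODE_TREE_MAP_SHIFT */
#define MAP_MASK  ((1u << MAP_SHIFT) - 1)	/* 64 slots per objnode */

static void print_slot_path(uint32_t index, int height)
{
	printf("index %u, height %d:", index, height);
	/* same walk as the lookup: 6 index bits select the slot at each level */
	for (int level = height - 1; level >= 0; level--)
		printf(" slot %u", (index >> (level * MAP_SHIFT)) & MAP_MASK);
	printf("\n");
}

int main(void)
{
	print_slot_path(4097, 3);	/* 4097 = 1*64*64 + 0*64 + 1 -> slots 1, 0, 1 */
	return 0;
}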
- */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - /* for current design of ramster, all pages belonging to - * an object reside on the same remotenode and extra is - * used to record the number of the remotenode so a - * flush-object operation can specify it */ - void *extra; /* for use by pampd implementation */ - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(char *, size_t, bool, int, - struct tmem_pool *, struct tmem_oid *, uint32_t); - int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t); - int (*get_data_and_free)(char *, size_t *, bool, void *, - struct tmem_pool *, struct tmem_oid *, - uint32_t); - void (*free)(void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool); - void (*free_obj)(struct tmem_pool *, struct tmem_obj *); - bool (*is_remote)(void *); - void *(*repatriate_preload)(void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool *); - int (*repatriate)(void *, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t, bool, void *); - void (*new_obj)(struct tmem_obj *); - int (*replace_in_obj)(void *, struct tmem_obj *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t, bool, int); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t *, bool, int); -extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, - void *); -extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *, - uint32_t index, struct tmem_obj **, - void **); -extern void tmem_localify_finish(struct tmem_obj *, uint32_t index, - void *, void *, bool); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* _TMEM_H */ diff --git a/drivers/staging/ramster/xvmalloc.c b/drivers/staging/ramster/xvmalloc.c deleted file mode 100644 index 93ba8e940..000000000 --- a/drivers/staging/ramster/xvmalloc.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You 
can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xvmalloc.h" -#include "xvmalloc_int.h" - -static void stat_inc(u64 *value) -{ - *value = *value + 1; -} - -static void stat_dec(u64 *value) -{ - *value = *value - 1; -} - -static int test_flag(struct block_header *block, enum blockflags flag) -{ - return block->prev & BIT(flag); -} - -static void set_flag(struct block_header *block, enum blockflags flag) -{ - block->prev |= BIT(flag); -} - -static void clear_flag(struct block_header *block, enum blockflags flag) -{ - block->prev &= ~BIT(flag); -} - -/* - * Given pair, provide a dereferencable pointer. - * This is called from xv_malloc/xv_free path, so it - * needs to be fast. - */ -static void *get_ptr_atomic(struct page *page, u16 offset) -{ - unsigned char *base; - - base = kmap_atomic(page); - return base + offset; -} - -static void put_ptr_atomic(void *ptr) -{ - kunmap_atomic(ptr); -} - -static u32 get_blockprev(struct block_header *block) -{ - return block->prev & PREV_MASK; -} - -static void set_blockprev(struct block_header *block, u16 new_offset) -{ - block->prev = new_offset | (block->prev & FLAGS_MASK); -} - -static struct block_header *BLOCK_NEXT(struct block_header *block) -{ - return (struct block_header *) - ((char *)block + block->size + XV_ALIGN); -} - -/* - * Get index of free list containing blocks of maximum size - * which is less than or equal to given size. - */ -static u32 get_index_for_insert(u32 size) -{ - if (unlikely(size > XV_MAX_ALLOC_SIZE)) - size = XV_MAX_ALLOC_SIZE; - size &= ~FL_DELTA_MASK; - return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT; -} - -/* - * Get index of free list having blocks of size greater than - * or equal to requested size. - */ -static u32 get_index(u32 size) -{ - if (unlikely(size < XV_MIN_ALLOC_SIZE)) - size = XV_MIN_ALLOC_SIZE; - size = ALIGN(size, FL_DELTA); - return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT; -} - -/** - * find_block - find block of at least given size - * @pool: memory pool to search from - * @size: size of block required - * @page: page containing required block - * @offset: offset within the page where block is located. - * - * Searches two level bitmap to locate block of at least - * the given size. If such a block is found, it provides - * to identify this block and returns index - * in freelist where we found this block. - * Otherwise, returns 0 and params are not touched. - */ -static u32 find_block(struct xv_pool *pool, u32 size, - struct page **page, u32 *offset) -{ - ulong flbitmap, slbitmap; - u32 flindex, slindex, slbitstart; - - /* There are no free blocks in this pool */ - if (!pool->flbitmap) - return 0; - - /* Get freelist index correspoding to this size */ - slindex = get_index(size); - slbitmap = pool->slbitmap[slindex / BITS_PER_LONG]; - slbitstart = slindex % BITS_PER_LONG; - - /* - * If freelist is not empty at this index, we found the - * block - head of this list. This is approximate best-fit match. - */ - if (test_bit(slbitstart, &slbitmap)) { - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - return slindex; - } - - /* - * No best-fit found. Search a bit further in bitmap for a free block. - * Second level bitmap consists of series of 32-bit chunks. 
Search - * further in the chunk where we expected a best-fit, starting from - * index location found above. - */ - slbitstart++; - slbitmap >>= slbitstart; - - /* Skip this search if we were already at end of this bitmap chunk */ - if ((slbitstart != BITS_PER_LONG) && slbitmap) { - slindex += __ffs(slbitmap) + 1; - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - return slindex; - } - - /* Now do a full two-level bitmap search to find next nearest fit */ - flindex = slindex / BITS_PER_LONG; - - flbitmap = (pool->flbitmap) >> (flindex + 1); - if (!flbitmap) - return 0; - - flindex += __ffs(flbitmap) + 1; - slbitmap = pool->slbitmap[flindex]; - slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap); - *page = pool->freelist[slindex].page; - *offset = pool->freelist[slindex].offset; - - return slindex; -} - -/* - * Insert block at in freelist of given pool. - * freelist used depends on block size. - */ -static void insert_block(struct xv_pool *pool, struct page *page, u32 offset, - struct block_header *block) -{ - u32 flindex, slindex; - struct block_header *nextblock; - - slindex = get_index_for_insert(block->size); - flindex = slindex / BITS_PER_LONG; - - block->link.prev_page = NULL; - block->link.prev_offset = 0; - block->link.next_page = pool->freelist[slindex].page; - block->link.next_offset = pool->freelist[slindex].offset; - pool->freelist[slindex].page = page; - pool->freelist[slindex].offset = offset; - - if (block->link.next_page) { - nextblock = get_ptr_atomic(block->link.next_page, - block->link.next_offset); - nextblock->link.prev_page = page; - nextblock->link.prev_offset = offset; - put_ptr_atomic(nextblock); - /* If there was a next page then the free bits are set. */ - return; - } - - __set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]); - __set_bit(flindex, &pool->flbitmap); -} - -/* - * Remove block from freelist. Index 'slindex' identifies the freelist. - */ -static void remove_block(struct xv_pool *pool, struct page *page, u32 offset, - struct block_header *block, u32 slindex) -{ - u32 flindex = slindex / BITS_PER_LONG; - struct block_header *tmpblock; - - if (block->link.prev_page) { - tmpblock = get_ptr_atomic(block->link.prev_page, - block->link.prev_offset); - tmpblock->link.next_page = block->link.next_page; - tmpblock->link.next_offset = block->link.next_offset; - put_ptr_atomic(tmpblock); - } - - if (block->link.next_page) { - tmpblock = get_ptr_atomic(block->link.next_page, - block->link.next_offset); - tmpblock->link.prev_page = block->link.prev_page; - tmpblock->link.prev_offset = block->link.prev_offset; - put_ptr_atomic(tmpblock); - } - - /* Is this block is at the head of the freelist? 
*/ - if (pool->freelist[slindex].page == page - && pool->freelist[slindex].offset == offset) { - - pool->freelist[slindex].page = block->link.next_page; - pool->freelist[slindex].offset = block->link.next_offset; - - if (pool->freelist[slindex].page) { - struct block_header *tmpblock; - tmpblock = get_ptr_atomic(pool->freelist[slindex].page, - pool->freelist[slindex].offset); - tmpblock->link.prev_page = NULL; - tmpblock->link.prev_offset = 0; - put_ptr_atomic(tmpblock); - } else { - /* This freelist bucket is empty */ - __clear_bit(slindex % BITS_PER_LONG, - &pool->slbitmap[flindex]); - if (!pool->slbitmap[flindex]) - __clear_bit(flindex, &pool->flbitmap); - } - } - - block->link.prev_page = NULL; - block->link.prev_offset = 0; - block->link.next_page = NULL; - block->link.next_offset = 0; -} - -/* - * Allocate a page and add it to freelist of given pool. - */ -static int grow_pool(struct xv_pool *pool, gfp_t flags) -{ - struct page *page; - struct block_header *block; - - page = alloc_page(flags); - if (unlikely(!page)) - return -ENOMEM; - - stat_inc(&pool->total_pages); - - spin_lock(&pool->lock); - block = get_ptr_atomic(page, 0); - - block->size = PAGE_SIZE - XV_ALIGN; - set_flag(block, BLOCK_FREE); - clear_flag(block, PREV_FREE); - set_blockprev(block, 0); - - insert_block(pool, page, 0, block); - - put_ptr_atomic(block); - spin_unlock(&pool->lock); - - return 0; -} - -/* - * Create a memory pool. Allocates freelist, bitmaps and other - * per-pool metadata. - */ -struct xv_pool *xv_create_pool(void) -{ - u32 ovhd_size; - struct xv_pool *pool; - - ovhd_size = roundup(sizeof(*pool), PAGE_SIZE); - pool = kzalloc(ovhd_size, GFP_KERNEL); - if (!pool) - return NULL; - - spin_lock_init(&pool->lock); - - return pool; -} -EXPORT_SYMBOL_GPL(xv_create_pool); - -void xv_destroy_pool(struct xv_pool *pool) -{ - kfree(pool); -} -EXPORT_SYMBOL_GPL(xv_destroy_pool); - -/** - * xv_malloc - Allocate block of given size from pool. - * @pool: pool to allocate from - * @size: size of block to allocate - * @page: page no. that holds the object - * @offset: location of object within page - * - * On success, identifies block allocated - * and 0 is returned. On failure, is set to - * 0 and -ENOMEM is returned. - * - * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail. 
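Because the allocator hands back a (page, offset) pair rather than a pointer, a usage sketch may help. The helper below is hypothetical and will not build outside a kernel tree; it assumes process context (hence GFP_KERNEL) and the xvmalloc.h interface being removed above. The object is later released with xv_free(pool, page, offset) using the same pair.

#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/types.h>
#include "xvmalloc.h"

/* Hypothetical helper: copy 'len' bytes into a freshly allocated object. */
static int example_store(struct xv_pool *pool, const void *src, u32 len,
                         struct page **pagep, u32 *offsetp)
{
        u8 *va;
        int ret;

        /* xv_malloc() fills in the (page, offset) pair on success. */
        ret = xv_malloc(pool, len, pagep, offsetp, GFP_KERNEL);
        if (ret)
                return ret;     /* -ENOMEM, or len was 0 / too large */

        /* The object is not addressable until its page is mapped. */
        va = kmap_atomic(*pagep);
        memcpy(va + *offsetp, src, len);
        kunmap_atomic(va);
        return 0;
}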
- */ -int xv_malloc(struct xv_pool *pool, u32 size, struct page **page, - u32 *offset, gfp_t flags) -{ - int error; - u32 index, tmpsize, origsize, tmpoffset; - struct block_header *block, *tmpblock; - - *page = NULL; - *offset = 0; - origsize = size; - - if (unlikely(!size || size > XV_MAX_ALLOC_SIZE)) - return -ENOMEM; - - size = ALIGN(size, XV_ALIGN); - - spin_lock(&pool->lock); - - index = find_block(pool, size, page, offset); - - if (!*page) { - spin_unlock(&pool->lock); - if (flags & GFP_NOWAIT) - return -ENOMEM; - error = grow_pool(pool, flags); - if (unlikely(error)) - return error; - - spin_lock(&pool->lock); - index = find_block(pool, size, page, offset); - } - - if (!*page) { - spin_unlock(&pool->lock); - return -ENOMEM; - } - - block = get_ptr_atomic(*page, *offset); - - remove_block(pool, *page, *offset, block, index); - - /* Split the block if required */ - tmpoffset = *offset + size + XV_ALIGN; - tmpsize = block->size - size; - tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN); - if (tmpsize) { - tmpblock->size = tmpsize - XV_ALIGN; - set_flag(tmpblock, BLOCK_FREE); - clear_flag(tmpblock, PREV_FREE); - - set_blockprev(tmpblock, *offset); - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) - insert_block(pool, *page, tmpoffset, tmpblock); - - if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) { - tmpblock = BLOCK_NEXT(tmpblock); - set_blockprev(tmpblock, tmpoffset); - } - } else { - /* This block is exact fit */ - if (tmpoffset != PAGE_SIZE) - clear_flag(tmpblock, PREV_FREE); - } - - block->size = origsize; - clear_flag(block, BLOCK_FREE); - - put_ptr_atomic(block); - spin_unlock(&pool->lock); - - *offset += XV_ALIGN; - - return 0; -} -EXPORT_SYMBOL_GPL(xv_malloc); - -/* - * Free block identified with - */ -void xv_free(struct xv_pool *pool, struct page *page, u32 offset) -{ - void *page_start; - struct block_header *block, *tmpblock; - - offset -= XV_ALIGN; - - spin_lock(&pool->lock); - - page_start = get_ptr_atomic(page, 0); - block = (struct block_header *)((char *)page_start + offset); - - /* Catch double free bugs */ - BUG_ON(test_flag(block, BLOCK_FREE)); - - block->size = ALIGN(block->size, XV_ALIGN); - - tmpblock = BLOCK_NEXT(block); - if (offset + block->size + XV_ALIGN == PAGE_SIZE) - tmpblock = NULL; - - /* Merge next block if its free */ - if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) { - /* - * Blocks smaller than XV_MIN_ALLOC_SIZE - * are not inserted in any free list. - */ - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) { - remove_block(pool, page, - offset + block->size + XV_ALIGN, tmpblock, - get_index_for_insert(tmpblock->size)); - } - block->size += tmpblock->size + XV_ALIGN; - } - - /* Merge previous block if its free */ - if (test_flag(block, PREV_FREE)) { - tmpblock = (struct block_header *)((char *)(page_start) + - get_blockprev(block)); - offset = offset - tmpblock->size - XV_ALIGN; - - if (tmpblock->size >= XV_MIN_ALLOC_SIZE) - remove_block(pool, page, offset, tmpblock, - get_index_for_insert(tmpblock->size)); - - tmpblock->size += block->size + XV_ALIGN; - block = tmpblock; - } - - /* No used objects in this page. Free it. 
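To make the split bookkeeping concrete, this small standalone program walks the arithmetic for the first allocation from a fresh page, assuming 4 KiB pages and the 32-bit XV_ALIGN of 4 (assumed values; the real ones come from xvmalloc_int.h further down). It also shows why the returned offset is advanced by XV_ALIGN: the first XV_ALIGN bytes of every block are its header.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096u
#define XV_ALIGN  4u                    /* assumed: 32-bit build, 1 << XV_ALIGN_SHIFT */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int block_size = PAGE_SIZE - XV_ALIGN; /* fresh page from grow_pool() */
        unsigned int offset = 0, request = 100;
        unsigned int size = ALIGN(request, XV_ALIGN);   /* 100 stays 100 */

        unsigned int tmpoffset = offset + size + XV_ALIGN; /* header of the remainder */
        unsigned int tmpsize = block_size - size;
        unsigned int remainder = tmpsize - XV_ALIGN;       /* usable bytes left over  */

        /* The remainder's header plus payload ends exactly at the page boundary. */
        assert(tmpoffset + XV_ALIGN + remainder == PAGE_SIZE);

        printf("caller data at offset %u, remainder block at %u holds %u bytes\n",
               offset + XV_ALIGN, tmpoffset, remainder);
        return 0;
}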
*/ - if (block->size == PAGE_SIZE - XV_ALIGN) { - put_ptr_atomic(page_start); - spin_unlock(&pool->lock); - - __free_page(page); - stat_dec(&pool->total_pages); - return; - } - - set_flag(block, BLOCK_FREE); - if (block->size >= XV_MIN_ALLOC_SIZE) - insert_block(pool, page, offset, block); - - if (offset + block->size + XV_ALIGN != PAGE_SIZE) { - tmpblock = BLOCK_NEXT(block); - set_flag(tmpblock, PREV_FREE); - set_blockprev(tmpblock, offset); - } - - put_ptr_atomic(page_start); - spin_unlock(&pool->lock); -} -EXPORT_SYMBOL_GPL(xv_free); - -u32 xv_get_object_size(void *obj) -{ - struct block_header *blk; - - blk = (struct block_header *)((char *)(obj) - XV_ALIGN); - return blk->size; -} -EXPORT_SYMBOL_GPL(xv_get_object_size); - -/* - * Returns total memory used by allocator (userdata + metadata) - */ -u64 xv_get_total_size_bytes(struct xv_pool *pool) -{ - return pool->total_pages << PAGE_SHIFT; -} -EXPORT_SYMBOL_GPL(xv_get_total_size_bytes); diff --git a/drivers/staging/ramster/xvmalloc.h b/drivers/staging/ramster/xvmalloc.h deleted file mode 100644 index 5b1a81aa5..000000000 --- a/drivers/staging/ramster/xvmalloc.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _XV_MALLOC_H_ -#define _XV_MALLOC_H_ - -#include - -struct xv_pool; - -struct xv_pool *xv_create_pool(void); -void xv_destroy_pool(struct xv_pool *pool); - -int xv_malloc(struct xv_pool *pool, u32 size, struct page **page, - u32 *offset, gfp_t flags); -void xv_free(struct xv_pool *pool, struct page *page, u32 offset); - -u32 xv_get_object_size(void *obj); -u64 xv_get_total_size_bytes(struct xv_pool *pool); - -#endif diff --git a/drivers/staging/ramster/xvmalloc_int.h b/drivers/staging/ramster/xvmalloc_int.h deleted file mode 100644 index b5f1f7feb..000000000 --- a/drivers/staging/ramster/xvmalloc_int.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * xvmalloc memory allocator - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. - * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - */ - -#ifndef _XV_MALLOC_INT_H_ -#define _XV_MALLOC_INT_H_ - -#include -#include - -/* User configurable params */ - -/* Must be power of two */ -#ifdef CONFIG_64BIT -#define XV_ALIGN_SHIFT 3 -#else -#define XV_ALIGN_SHIFT 2 -#endif -#define XV_ALIGN (1 << XV_ALIGN_SHIFT) -#define XV_ALIGN_MASK (XV_ALIGN - 1) - -/* This must be greater than sizeof(link_free) */ -#define XV_MIN_ALLOC_SIZE 32 -#define XV_MAX_ALLOC_SIZE (PAGE_SIZE - XV_ALIGN) - -/* - * Free lists are separated by FL_DELTA bytes - * This value is 3 for 4k pages and 4 for 64k pages, for any - * other page size, a conservative (PAGE_SHIFT - 9) is used. 
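As a worked example of the index arithmetic, the following standalone program restates get_index() and get_index_for_insert() with constants assumed for a 4 KiB page (FL_DELTA_SHIFT = 3, so free lists are 8 bytes apart). The round-up/round-down pairing is what guarantees that a block taken from the list get_index(size) selects is always large enough for the request.

#include <stdio.h>

#define PAGE_SIZE          4096u
#define XV_ALIGN           4u
#define XV_MIN_ALLOC_SIZE  32u
#define XV_MAX_ALLOC_SIZE  (PAGE_SIZE - XV_ALIGN)
#define FL_DELTA_SHIFT     3u                    /* PAGE_SHIFT(12) - 9 */
#define FL_DELTA           (1u << FL_DELTA_SHIFT)
#define FL_DELTA_MASK      (FL_DELTA - 1)
#define ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))

/* Round *up*: list whose blocks are all >= size (used when allocating). */
static unsigned int get_index(unsigned int size)
{
        if (size < XV_MIN_ALLOC_SIZE)
                size = XV_MIN_ALLOC_SIZE;
        size = ALIGN(size, FL_DELTA);
        return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
}

/* Round *down*: list whose blocks are all <= size (used when inserting). */
static unsigned int get_index_for_insert(unsigned int size)
{
        if (size > XV_MAX_ALLOC_SIZE)
                size = XV_MAX_ALLOC_SIZE;
        size &= ~FL_DELTA_MASK;
        return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
}

int main(void)
{
        /* A free 100-byte block is inserted on list 8, but a 100-byte request
         * starts searching from list 9, so any block found there fits. */
        printf("insert(100) -> %u, lookup(100) -> %u\n",
               get_index_for_insert(100), get_index(100));
        return 0;
}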
- */ -#if PAGE_SHIFT == 16 -#define FL_DELTA_SHIFT 4 -#else -#define FL_DELTA_SHIFT (PAGE_SHIFT - 9) -#endif -#define FL_DELTA (1 << FL_DELTA_SHIFT) -#define FL_DELTA_MASK (FL_DELTA - 1) -#define NUM_FREE_LISTS ((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \ - / FL_DELTA + 1) - -#define MAX_FLI DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG) - -/* End of user params */ - -enum blockflags { - BLOCK_FREE, - PREV_FREE, - __NR_BLOCKFLAGS, -}; - -#define FLAGS_MASK XV_ALIGN_MASK -#define PREV_MASK (~FLAGS_MASK) - -struct freelist_entry { - struct page *page; - u16 offset; - u16 pad; -}; - -struct link_free { - struct page *prev_page; - struct page *next_page; - u16 prev_offset; - u16 next_offset; -}; - -struct block_header { - union { - /* This common header must be XV_ALIGN bytes */ - u8 common[XV_ALIGN]; - struct { - u16 size; - u16 prev; - }; - }; - struct link_free link; -}; - -struct xv_pool { - ulong flbitmap; - ulong slbitmap[MAX_FLI]; - u64 total_pages; /* stats */ - struct freelist_entry freelist[NUM_FREE_LISTS]; - spinlock_t lock; -}; - -#endif diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c deleted file mode 100644 index 68b2e053a..000000000 --- a/drivers/staging/ramster/zcache-main.c +++ /dev/null @@ -1,3320 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing lzo1x compression: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) xvmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - * RAMSTER TODO: - * - handle remotifying of buddied pages (see zbud_remotify_zbpg) - * - kernel boot params: nocleancache/nofrontswap don't always work?!? 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" -#include "zcache.h" -#include "ramster.h" -#include "cluster/tcp.h" - -#include "xvmalloc.h" /* temporary until change to zsmalloc */ - -#define RAMSTER_TESTING - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "ramster is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -enum ramster_remotify_op { - RAMSTER_REMOTIFY_EPH_PUT, - RAMSTER_REMOTIFY_PERS_PUT, - RAMSTER_REMOTIFY_FLUSH_PAGE, - RAMSTER_REMOTIFY_FLUSH_OBJ, - RAMSTER_INTRANSIT_PERS -}; - -struct ramster_remotify_hdr { - enum ramster_remotify_op op; - struct list_head list; -}; - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - struct ramster_remotify_hdr rem_op; - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -#define ZVH_SENTINEL 0x43214321 -static const int zv_max_page_size = (PAGE_SIZE / 8) * 7; - -struct zv_hdr { - struct ramster_remotify_hdr rem_op; - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - DECL_SENTINEL -}; - -struct flushlist_node { - struct ramster_remotify_hdr rem_op; - struct tmem_xhandle xh; -}; - -union { - struct ramster_remotify_hdr rem_op; - struct zv_hdr zv; - struct zbud_hdr zbud; - struct flushlist_node flist; -} remotify_list_node; - -static LIST_HEAD(zcache_rem_op_list); -static DEFINE_SPINLOCK(zcache_rem_op_list_lock); - -#if 0 -/* this is more aggressive but may cause other problems? */ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -#define MAX_POOLS_PER_CLIENT 16 - -#define MAX_CLIENTS 16 -#define LOCAL_CLIENT ((uint16_t)-1) - -MODULE_LICENSE("GPL"); - -struct zcache_client { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct xv_pool *xvpool; - bool allocated; - atomic_t refcount; -}; - -static struct zcache_client zcache_host; -static struct zcache_client zcache_clients[MAX_CLIENTS]; - -static inline uint16_t get_client_id_from_client(struct zcache_client *cli) -{ - BUG_ON(cli == NULL); - if (cli == &zcache_host) - return LOCAL_CLIENT; - return cli - &zcache_clients[0]; -} - -static inline bool is_local_client(struct zcache_client *cli) -{ - return cli == &zcache_host; -} - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. 
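A short standalone sketch of that layout arithmetic follows. The 64-byte header size is an assumption (the real sizeof(struct zbud_page) is architecture dependent); the two offsets mirror how zbud_data(), below, addresses buddy 0 just past the header and buddy 1 backwards from the end of the page.

#include <stdio.h>

#define PAGE_SIZE   4096u
#define CHUNK_SHIFT 6
#define CHUNK_SIZE  (1u << CHUNK_SHIFT)
#define CHUNK_MASK  (~(CHUNK_SIZE - 1))

/* Same rounding as zbud_size_to_chunks(): whole 64-byte chunks consumed. */
static unsigned int size_to_chunks(unsigned int size)
{
        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

int main(void)
{
        unsigned int hdr = 64;            /* assumed sizeof(struct zbud_page) */
        unsigned int size0 = 1000, size1 = 700;

        /* buddy 0 starts just past the header, rounded up to a chunk;
         * buddy 1 is laid out backwards from the end of the page.      */
        unsigned int p0 = (hdr + CHUNK_SIZE - 1) & CHUNK_MASK;
        unsigned int p1 = PAGE_SIZE - ((size1 + CHUNK_SIZE - 1) & CHUNK_MASK);
        unsigned int budget = (PAGE_SIZE - p0) >> CHUNK_SHIFT;

        printf("buddy0: %u chunks at %u, buddy1: %u chunks at %u, budget %u\n",
               size_to_chunks(size0), p0, size_to_chunks(size1), p1, budget);
        return 0;
}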
- */ - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; -static unsigned long zcache_policy_percent_exceeded; -static unsigned long zcache_mean_compress_poor; - -/* - * RAMster counters - * - Remote pages are pages with a local pampd but the data is remote - * - Foreign pages are pages stored locally but belonging to another node - */ -static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0); -static unsigned long ramster_pers_remotify_enable; -static unsigned long ramster_eph_remotify_enable; -static unsigned long ramster_eph_pages_remoted; -static unsigned long ramster_eph_pages_remote_failed; -static unsigned long ramster_pers_pages_remoted; -static unsigned long ramster_pers_pages_remote_failed; -static unsigned long ramster_pers_pages_remote_nomem; -static unsigned long ramster_remote_objects_flushed; -static unsigned long ramster_remote_object_flushes_failed; -static unsigned long ramster_remote_pages_flushed; -static unsigned long ramster_remote_page_flushes_failed; -static unsigned long ramster_remote_eph_pages_succ_get; -static unsigned long ramster_remote_pers_pages_succ_get; -static unsigned long ramster_remote_eph_pages_unsucc_get; -static unsigned long ramster_remote_pers_pages_unsucc_get; -static atomic_t ramster_curr_flnode_count = ATOMIC_INIT(0); -static unsigned long ramster_curr_flnode_count_max; -static atomic_t ramster_foreign_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long ramster_foreign_eph_pampd_count_max; -static atomic_t ramster_foreign_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long ramster_foreign_pers_pampd_count_max; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page 
*zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -static void zbud_copy_from_pampd(char *data, size_t *size, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - BUG_ON(zh->size > *size); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((zh->size + CHUNK_SIZE - 1) & CHUNK_MASK); - /* client should be filled in by caller */ - memcpy(data, p, zh->size); - *size = zh->size; - spin_unlock(&zbpg->lock); -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - atomic_dec(&zcache_zbud_curr_raw_pages); - zcache_free_page(zbpg); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - /* FIXME, should be BUG_ON, pool destruction path doesn't disable - * interrupts tmem_destroy_pool()->tmem_pampd_destroy_all_in_obj()-> - * tmem_objnode_node_destroy()-> zcache_pampd_free() */ - WARN_ON(!irqs_disabled()); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - spin_lock(&zbud_budlists_spinlock); - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - spin_lock(&zbud_budlists_spinlock); - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, - struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - zh->client_id = client_id; - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - size_t out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... 
see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page); - size = zh->size; - from_va = zbud_data(zh, size); - ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len); - BUG_ON(ret != LZO_E_OK); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. - */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, - uint16_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; - uint32_t index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - unsigned long flags; - - ASSERT_SPINLOCK(&zbpg->lock); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - client_id[j] = zh->client_id; - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); - BUG_ON(pool == NULL); - local_irq_save(flags); - /* these flushes should dispose of any local storage */ - tmem_flush_page(pool, &oid[i], index[i]); - local_irq_restore(flags); - zcache_put_pool(pool); - } -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. 
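The trylock-and-skip idiom described here is worth isolating. The program below is a generic userspace analogy using pthread mutexes, not driver code and not the kernel API: contended items are simply skipped rather than waited on, which is how zbud_evict_pages() avoids both stalls and lock-order problems.

/* build: cc -pthread trylock_demo.c */
#include <pthread.h>
#include <stdio.h>

#define N 4

static pthread_mutex_t locks[N] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Visit every item we can lock without blocking; skip the busy ones. */
static void evict_what_we_can(void)
{
        for (int i = 0; i < N; i++) {
                if (pthread_mutex_trylock(&locks[i]) != 0) {
                        printf("item %d busy, skipped\n", i);
                        continue;
                }
                printf("item %d evicted\n", i);
                pthread_mutex_unlock(&locks[i]);
        }
}

int main(void)
{
        pthread_mutex_lock(&locks[2]);   /* pretend another CPU holds item 2 */
        evict_what_we_can();
        pthread_mutex_unlock(&locks[2]);
        return 0;
}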
- */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i, newly_unused_pages = 0; - - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - newly_unused_pages++; - local_bh_enable(); - if (--nr <= 0) - goto evict_unused; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto evict_unused; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - newly_unused_pages++; - local_bh_enable(); - if (--nr <= 0) - goto evict_unused; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); - -evict_unused: - return; -} - -static DEFINE_PER_CPU(unsigned char *, zcache_remoteputmem); - -static int zbud_remotify_zbud(struct tmem_xhandle *xh, char *data, - size_t size) -{ - struct tmem_pool *pool; - int i, remotenode, ret = -1; - unsigned char cksum, *p; - unsigned long flags; - - for (p = data, cksum = 0, i = 0; i < size; i++) - cksum += *p; - ret = ramster_remote_put(xh, data, size, true, &remotenode); - if (ret == 0) { - /* data was successfully remoted so change the local version - * to point to the remote node where it landed */ - pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh->pool_id); - BUG_ON(pool == NULL); - local_irq_save(flags); - /* tmem_replace will also free up any local space */ - (void)tmem_replace(pool, &xh->oid, xh->index, - pampd_make_remote(remotenode, size, cksum)); - local_irq_restore(flags); - zcache_put_pool(pool); - ramster_eph_pages_remoted++; - ret = 0; - } else - ramster_eph_pages_remote_failed++; - return ret; -} - -static int zbud_remotify_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh1, *zh2 = NULL; - struct tmem_xhandle xh1, xh2 = { 0 }; - char *data1 = NULL, *data2 = NULL; - size_t size1 = 0, size2 = 0; - int ret = 0; - unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); - - ASSERT_SPINLOCK(&zbpg->lock); - if (zbpg->buddy[0].size == 0) - zh1 = &zbpg->buddy[1]; - else if (zbpg->buddy[1].size == 0) - zh1 = &zbpg->buddy[0]; - else { - zh1 = &zbpg->buddy[0]; - zh2 = &zbpg->buddy[1]; - } - /* don't remotify pages that are already remotified */ - if (zh1->client_id != LOCAL_CLIENT) - zh1 = NULL; - if ((zh2 != NULL) && (zh2->client_id != LOCAL_CLIENT)) - zh2 = NULL; - - /* copy the data and metadata so can release lock */ - if (zh1 != NULL) { - xh1.client_id = zh1->client_id; - xh1.pool_id = zh1->pool_id; - xh1.oid = zh1->oid; - xh1.index = zh1->index; - size1 = zh1->size; - data1 = zbud_data(zh1, size1); - memcpy(tmpmem, zbud_data(zh1, size1), size1); - data1 = tmpmem; - tmpmem += size1; - } - if (zh2 != NULL) { - xh2.client_id = zh2->client_id; - 
xh2.pool_id = zh2->pool_id; - xh2.oid = zh2->oid; - xh2.index = zh2->index; - size2 = zh2->size; - memcpy(tmpmem, zbud_data(zh2, size2), size2); - data2 = tmpmem; - } - spin_unlock(&zbpg->lock); - preempt_enable(); - - /* OK, no locks held anymore, remotify one or both zbuds */ - if (zh1 != NULL) - ret = zbud_remotify_zbud(&xh1, data1, size1); - if (zh2 != NULL) - ret |= zbud_remotify_zbud(&xh2, data2, size2); - return ret; -} - -void zbud_remotify_pages(int nr) -{ - struct zbud_page *zbpg; - int i, ret; - - /* - * for now just try remotifying unbuddied pages, starting with - * least space avail - */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - preempt_disable(); /* enable in zbud_remotify_zbpg */ - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); - continue; /* next i in for loop */ - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; /* next list_for_each_entry */ - zbud_unbuddied[i].count--; - /* want budlists unlocked when doing zbpg remotify */ - spin_unlock_bh(&zbud_budlists_spinlock); - ret = zbud_remotify_zbpg(zbpg); - /* preemption is re-enabled in zbud_remotify_zbpg */ - if (ret == 0) { - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - /* if fail to remotify any page, quit */ - pr_err("TESTING zbud_remotify_pages failed on page," - " trying to re-add\n"); - spin_lock_bh(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[i].list); - zbud_unbuddied[i].count++; - spin_unlock(&zbpg->lock); - spin_unlock_bh(&zbud_budlists_spinlock); - pr_err("TESTING zbud_remotify_pages failed on page," - " finished re-add\n"); - goto out; - } - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); - } - -next_buddied_zbpg: - preempt_disable(); /* enable in zbud_remotify_zbpg */ - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) - goto unlock_out; - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; /* next list_for_each_entry */ - zcache_zbud_buddied_count--; - /* want budlists unlocked when doing zbpg remotify */ - spin_unlock_bh(&zbud_budlists_spinlock); - ret = zbud_remotify_zbpg(zbpg); - /* preemption is re-enabled in zbud_remotify_zbpg */ - if (ret == 0) { - if (--nr <= 0) - goto out; - goto next_buddied_zbpg; - } - /* if fail to remotify any page, quit */ - pr_err("TESTING zbud_remotify_pages failed on BUDDIED page," - " trying to re-add\n"); - spin_lock_bh(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - spin_unlock(&zbpg->lock); - spin_unlock_bh(&zbud_budlists_spinlock); - pr_err("TESTING zbud_remotify_pages failed on BUDDIED page," - " finished re-add\n"); - goto out; - } -unlock_out: - spin_unlock_bh(&zbud_budlists_spinlock); - preempt_enable(); -out: - return; -} - -/* the "flush list" asynchronously collects pages to remotely flush */ -#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1) -static void ramster_flnode_free(struct flushlist_node *, - struct tmem_pool *); - -static void zcache_remote_flush_page(struct flushlist_node *flnode) -{ - struct tmem_xhandle *xh; - int remotenode, ret; - - preempt_disable(); - xh = &flnode->xh; - remotenode = flnode->xh.client_id; - ret = ramster_remote_flush(xh, remotenode); - if (ret >= 0) - ramster_remote_pages_flushed++; - else - 
ramster_remote_page_flushes_failed++; - preempt_enable_no_resched(); - ramster_flnode_free(flnode, NULL); -} - -static void zcache_remote_flush_object(struct flushlist_node *flnode) -{ - struct tmem_xhandle *xh; - int remotenode, ret; - - preempt_disable(); - xh = &flnode->xh; - remotenode = flnode->xh.client_id; - ret = ramster_remote_flush_object(xh, remotenode); - if (ret >= 0) - ramster_remote_objects_flushed++; - else - ramster_remote_object_flushes_failed++; - preempt_enable_no_resched(); - ramster_flnode_free(flnode, NULL); -} - -static void zcache_remote_eph_put(struct zbud_hdr *zbud) -{ - /* FIXME */ -} - -static void zcache_remote_pers_put(struct zv_hdr *zv) -{ - struct tmem_xhandle xh; - uint16_t size; - bool ephemeral; - int remotenode, ret = -1; - char *data; - struct tmem_pool *pool; - unsigned long flags; - unsigned char cksum; - char *p; - int i; - unsigned char *tmpmem = __get_cpu_var(zcache_remoteputmem); - - ASSERT_SENTINEL(zv, ZVH); - BUG_ON(zv->client_id != LOCAL_CLIENT); - local_bh_disable(); - xh.client_id = zv->client_id; - xh.pool_id = zv->pool_id; - xh.oid = zv->oid; - xh.index = zv->index; - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - data = (char *)zv + sizeof(*zv); - for (p = data, cksum = 0, i = 0; i < size; i++) - cksum += *p; - memcpy(tmpmem, data, size); - data = tmpmem; - pool = zcache_get_pool_by_id(zv->client_id, zv->pool_id); - ephemeral = is_ephemeral(pool); - zcache_put_pool(pool); - /* now OK to release lock set in caller */ - spin_unlock(&zcache_rem_op_list_lock); - local_bh_enable(); - preempt_disable(); - ret = ramster_remote_put(&xh, data, size, ephemeral, &remotenode); - preempt_enable_no_resched(); - if (ret != 0) { - /* - * This is some form of a memory leak... if the remote put - * fails, there will never be another attempt to remotify - * this page. But since we've dropped the zv pointer, - * the page may have been freed or the data replaced - * so we can't just "put it back" in the remote op list. - * Even if we could, not sure where to put it in the list - * because there may be flushes that must be strictly - * ordered vs the put. So leave this as a FIXME for now. - * But count them so we know if it becomes a problem. 
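For reference, an 8-bit additive checksum of the kind folded into pampd_make_remote() can be written as below. Note that, as shown above, the loops in zbud_remotify_zbud() and zcache_remote_pers_put() read *p without ever advancing p; the standalone version here walks the whole buffer.

#include <stdio.h>
#include <string.h>

/* 8-bit additive checksum over a buffer. */
static unsigned char byte_sum(const unsigned char *p, size_t len)
{
        unsigned char cksum = 0;

        while (len--)
                cksum += *p++;
        return cksum;
}

int main(void)
{
        const char *payload = "compressed page bytes";

        printf("cksum=%u\n", byte_sum((const unsigned char *)payload,
                                      strlen(payload)));
        return 0;
}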
- */ - ramster_pers_pages_remote_failed++; - goto out; - } else - atomic_inc(&ramster_remote_pers_pages); - ramster_pers_pages_remoted++; - /* - * data was successfully remoted so change the local version to - * point to the remote node where it landed - */ - local_bh_disable(); - pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id); - local_irq_save(flags); - (void)tmem_replace(pool, &xh.oid, xh.index, - pampd_make_remote(remotenode, size, cksum)); - local_irq_restore(flags); - zcache_put_pool(pool); - local_bh_enable(); -out: - return; -} - -static void zcache_do_remotify_ops(int nr) -{ - struct ramster_remotify_hdr *rem_op; - union remotify_list_node *u; - - while (1) { - if (!nr) - goto out; - spin_lock(&zcache_rem_op_list_lock); - if (list_empty(&zcache_rem_op_list)) { - spin_unlock(&zcache_rem_op_list_lock); - goto out; - } - rem_op = list_first_entry(&zcache_rem_op_list, - struct ramster_remotify_hdr, list); - list_del_init(&rem_op->list); - if (rem_op->op != RAMSTER_REMOTIFY_PERS_PUT) - spin_unlock(&zcache_rem_op_list_lock); - u = (union remotify_list_node *)rem_op; - switch (rem_op->op) { - case RAMSTER_REMOTIFY_EPH_PUT: -BUG(); - zcache_remote_eph_put((struct zbud_hdr *)rem_op); - break; - case RAMSTER_REMOTIFY_PERS_PUT: - zcache_remote_pers_put((struct zv_hdr *)rem_op); - break; - case RAMSTER_REMOTIFY_FLUSH_PAGE: - zcache_remote_flush_page((struct flushlist_node *)u); - break; - case RAMSTER_REMOTIFY_FLUSH_OBJ: - zcache_remote_flush_object((struct flushlist_node *)u); - break; - default: - BUG(); - } - } -out: - return; -} - -/* - * Communicate interface revision with userspace - */ -#include "cluster/ramster_nodemanager.h" -static unsigned long ramster_interface_revision = R2NM_API_VERSION; - -/* - * For now, just push over a few pages every few seconds to - * ensure that it basically works - */ -static struct workqueue_struct *ramster_remotify_workqueue; -static void ramster_remotify_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(ramster_remotify_worker, - ramster_remotify_process); - -static void ramster_remotify_queue_delayed_work(unsigned long delay) -{ - if (!queue_delayed_work(ramster_remotify_workqueue, - &ramster_remotify_worker, delay)) - pr_err("ramster_remotify: bad workqueue\n"); -} - - -static int use_frontswap; -static int use_cleancache; -static int ramster_remote_target_nodenum = -1; -static void ramster_remotify_process(struct work_struct *work) -{ - static bool remotify_in_progress; - - BUG_ON(irqs_disabled()); - if (remotify_in_progress) - ramster_remotify_queue_delayed_work(HZ); - else if (ramster_remote_target_nodenum != -1) { - remotify_in_progress = true; -#ifdef CONFIG_CLEANCACHE - if (use_cleancache && ramster_eph_remotify_enable) - zbud_remotify_pages(5000); /* FIXME is this a good number? */ -#endif -#ifdef CONFIG_FRONTSWAP - if (use_frontswap && ramster_pers_remotify_enable) - zcache_do_remotify_ops(500); /* FIXME is this a good number? 
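Every record queued on zcache_rem_op_list embeds a struct ramster_remotify_hdr as its first member, which is what lets zcache_do_remotify_ops() recover the full record from the list entry and dispatch on op. A minimal userspace sketch of that pattern (all names below are illustrative, not from the driver):

#include <stdio.h>

enum op { OP_FLUSH_PAGE, OP_FLUSH_OBJ };

/* Common header embedded first in every queued record. */
struct rem_hdr { enum op op; };

struct flush_page { struct rem_hdr hdr; int page_index; };
struct flush_obj  { struct rem_hdr hdr; int obj_id; };

/* Dispatch on the embedded header; the cast back is safe because the
 * header is the first member, so its address is the record's address. */
static void process(struct rem_hdr *hdr)
{
        switch (hdr->op) {
        case OP_FLUSH_PAGE:
                printf("flush page %d\n", ((struct flush_page *)hdr)->page_index);
                break;
        case OP_FLUSH_OBJ:
                printf("flush object %d\n", ((struct flush_obj *)hdr)->obj_id);
                break;
        }
}

int main(void)
{
        struct flush_page p = { { OP_FLUSH_PAGE }, 7 };
        struct flush_obj  o = { { OP_FLUSH_OBJ }, 3 };

        process(&p.hdr);
        process(&o.hdr);
        return 0;
}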
*/ -#endif - remotify_in_progress = false; - ramster_remotify_queue_delayed_work(HZ); - } -} - -static void ramster_remotify_init(void) -{ - unsigned long n = 60UL; - ramster_remotify_workqueue = - create_singlethread_workqueue("ramster_remotify"); - ramster_remotify_queue_delayed_work(n * HZ); -} - - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. - */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the TLSF-based xvMalloc - * with lzo1x compression to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. 
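A userspace stand-in for that layout, with the header immediately followed by the payload in a single allocation (struct zv_like and its fields are illustrative; the real struct zv_hdr is defined earlier in this file, and the payload length is recovered from the allocator rather than stored):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Small header immediately followed by the compressed payload. */
struct zv_like {
        unsigned int index;
        unsigned short pool_id;
        /* the real zv_hdr also carries oid, client_id, list linkage, sentinel */
};

int main(void)
{
        const char payload[] = "lzo-compressed bytes would go here";
        size_t clen = sizeof(payload);

        struct zv_like *zv = malloc(sizeof(*zv) + clen);
        if (!zv)
                return 1;
        zv->index = 42;
        zv->pool_id = 1;
        memcpy((char *)zv + sizeof(*zv), payload, clen);  /* data follows header */

        /* zv_decompress() finds the payload the same way, at (char *)zv +
         * sizeof(*zv); its length is xv_get_object_size() - sizeof(*zv). */
        printf("payload: %s\n", (char *)zv + sizeof(*zv));
        free(zv);
        return 0;
}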
- */ - -/* rudimentary policy limits */ -/* total number of persistent pages may not exceed this percentage */ -static unsigned int zv_page_count_policy_percent = 75; -/* - * byte count defining poor compression; pages with greater zsize will be - * rejected - */ -static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; -/* - * byte count defining poor *mean* compression; pages with greater zsize - * will be rejected until sufficient better-compressed pages are accepted - * driving the mean below this threshold - */ -static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; - -static atomic_t zv_curr_dist_counts[NCHUNKS]; -static atomic_t zv_cumul_dist_counts[NCHUNKS]; - - -static struct zv_hdr *zv_create(struct zcache_client *cli, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int alloc_size = clen + sizeof(struct zv_hdr); - int chunks = (alloc_size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - int ret; - - BUG_ON(!irqs_disabled()); - BUG_ON(chunks >= NCHUNKS); - ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - atomic_inc(&zv_curr_dist_counts[chunks]); - atomic_inc(&zv_cumul_dist_counts[chunks]); - zv = kmap_atomic(page) + offset; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - SET_SENTINEL(zv, ZVH); - INIT_LIST_HEAD(&zv->rem_op.list); - zv->client_id = get_client_id_from_client(cli); - zv->rem_op.op = RAMSTER_REMOTIFY_PERS_PUT; - if (zv->client_id == LOCAL_CLIENT) { - spin_lock(&zcache_rem_op_list_lock); - list_add_tail(&zv->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); - } - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - kunmap_atomic(zv); -out: - return zv; -} - -/* similar to zv_create, but just reserve space, no data yet */ -static struct zv_hdr *zv_alloc(struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, - unsigned clen) -{ - struct zcache_client *cli = pool->client; - struct page *page; - struct zv_hdr *zv = NULL; - uint32_t offset; - int ret; - - BUG_ON(!irqs_disabled()); - BUG_ON(!is_local_client(pool->client)); - ret = xv_malloc(cli->xvpool, clen + sizeof(struct zv_hdr), - &page, &offset, ZCACHE_GFP_MASK); - if (unlikely(ret)) - goto out; - zv = kmap_atomic(page) + offset; - SET_SENTINEL(zv, ZVH); - INIT_LIST_HEAD(&zv->rem_op.list); - zv->client_id = LOCAL_CLIENT; - zv->rem_op.op = RAMSTER_INTRANSIT_PERS; - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool->pool_id; - kunmap_atomic(zv); -out: - return zv; -} - -static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv) -{ - unsigned long flags; - struct page *page; - uint32_t offset; - uint16_t size = xv_get_object_size(zv); - int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - - ASSERT_SENTINEL(zv, ZVH); - BUG_ON(chunks >= NCHUNKS); - atomic_dec(&zv_curr_dist_counts[chunks]); - size -= sizeof(*zv); - spin_lock(&zcache_rem_op_list_lock); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0); - INVERT_SENTINEL(zv, ZVH); - if (!list_empty(&zv->rem_op.list)) - list_del_init(&zv->rem_op.list); - spin_unlock(&zcache_rem_op_list_lock); - page = virt_to_page(zv); - offset = (unsigned long)zv & ~PAGE_MASK; - local_irq_save(flags); - xv_free(xvpool, page, offset); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, struct zv_hdr *zv) -{ - size_t clen = PAGE_SIZE; - char *to_va; - unsigned size; - int ret; - - 
ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0); - to_va = kmap_atomic(page); - ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv), - size, to_va, &clen); - kunmap_atomic(to_va); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); -} - -static void zv_copy_from_pampd(char *data, size_t *bufsize, struct zv_hdr *zv) -{ - unsigned size; - - ASSERT_SENTINEL(zv, ZVH); - size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(size == 0 || size > zv_max_page_size); - BUG_ON(size > *bufsize); - memcpy(data, (char *)zv + sizeof(*zv), size); - *bufsize = size; -} - -static void zv_copy_to_pampd(struct zv_hdr *zv, char *data, size_t size) -{ - unsigned zv_size; - - ASSERT_SENTINEL(zv, ZVH); - zv_size = xv_get_object_size(zv) - sizeof(*zv); - BUG_ON(zv_size != size); - BUG_ON(zv_size == 0 || zv_size > zv_max_page_size); - memcpy((char *)zv + sizeof(*zv), data, size); -} - -#ifdef CONFIG_SYSFS -/* - * show a distribution of compression stats for zv pages. - */ - -static int zv_curr_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_curr_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -static int zv_cumul_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_cumul_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -/* - * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected. We don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_zsize); -} - -static ssize_t zv_max_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_zsize = val; - return count; -} - -/* - * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected UNLESS the mean compression is also smaller - * than this value. In other words, we are load-balancing-by-zsize the - * accepted pages. Again, we don't allow the value to get too close - * to PAGE_SIZE. 
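The interaction of the two thresholds is easier to see in isolation. The sketch below mirrors the acceptance test in zcache_pampd_pers_create(), with total_zsize standing in, in simplified form, for what xv_get_total_size_bytes() reports:

#include <stdio.h>

#define PAGE_SIZE 4096u

static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7;        /* 3584 */
static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5;   /* 2560 */

/* A poorly compressing page is still accepted as long as the running
 * mean compressed size stays under zv_max_mean_zsize. */
static int accept_zsize(unsigned int clen, unsigned long long total_zsize,
                        unsigned long count)
{
        if (clen > zv_max_zsize)
                return 0;                        /* individually too poor */
        if (clen > zv_max_mean_zsize && count > 0 &&
            total_zsize / count > zv_max_mean_zsize)
                return 0;                        /* would not be balanced out */
        return 1;
}

int main(void)
{
        /* 1000 pages averaging 2000 bytes: a 3000-byte page is accepted,
         * but not if the mean were already 3000. */
        printf("%d %d\n",
               accept_zsize(3000, 1000ULL * 2000, 1000),
               accept_zsize(3000, 1000ULL * 3000, 1000));
        return 0;
}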
- */ -static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_mean_zsize); -} - -static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_mean_zsize = val; - return count; -} - -/* - * setting zv_page_count_policy_percent via sysfs sets an upper bound of - * persistent (e.g. swap) pages that will be retained according to: - * (zv_page_count_policy_percent * totalram_pages) / 100) - * when that limit is reached, further puts will be rejected (until - * some pages have been flushed). Note that, due to compression, - * this number may exceed 100; it defaults to 75 and we set an - * arbitary limit of 150. A poor choice will almost certainly result - * in OOM's, so this value should only be changed prudently. - */ -static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_page_count_policy_percent); -} - -static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > 150)) - return -EINVAL; - zv_page_count_policy_percent = val; - return count; -} - -static struct kobj_attribute zcache_zv_max_zsize_attr = { - .attr = { .name = "zv_max_zsize", .mode = 0644 }, - .show = zv_max_zsize_show, - .store = zv_max_zsize_store, -}; - -static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { - .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, - .show = zv_max_mean_zsize_show, - .store = zv_max_mean_zsize_store, -}; - -static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { - .attr = { .name = "zv_page_count_policy_percent", - .mode = 0644 }, - .show = zv_page_count_policy_percent_show, - .store = zv_page_count_policy_percent_store, -}; -#endif - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_nonactive_puts; -static unsigned long zcache_failed_pers_puts; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself): LOCAL_CLIENT. - * RAMster has each client numbered by cluster node, and a KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. 
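For concreteness, the page-count bound described above works out as follows on a hypothetical machine; the RAM size is an assumption for illustration, and totalram_pages is the kernel global used by the real check:

#include <stdio.h>

/* Upper bound on locally retained persistent pages:
 * (zv_page_count_policy_percent * totalram_pages) / 100. */
int main(void)
{
        unsigned long totalram_pages = 131072;   /* assumed: 512 MiB of 4 KiB frames */
        unsigned int policy_percent = 75;        /* the default described above      */

        unsigned long limit = policy_percent * totalram_pages / 100;

        printf("reject persistent puts once %lu pages are held\n", limit);
        return 0;
}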
- */ -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else { - if (cli_id >= MAX_CLIENTS) - goto out; - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - } - if (poolid < MAX_POOLS_PER_CLIENT) { - pool = cli->tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } -out: - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - struct zcache_client *cli = NULL; - - if (pool == NULL) - BUG(); - cli = pool->client; - atomic_dec(&pool->refcount); - atomic_dec(&cli->refcount); -} - -int zcache_new_client(uint16_t cli_id) -{ - struct zcache_client *cli = NULL; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - if (cli->allocated) - goto out; - cli->allocated = 1; -#ifdef CONFIG_FRONTSWAP - cli->xvpool = xv_create_pool(); - if (cli->xvpool == NULL) - goto out; -#endif - ret = 0; -out: - return ret; -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static struct kmem_cache *ramster_flnode_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. 
due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; - struct flushlist_node *flnode; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - struct flushlist_node *flnode; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto out; - } - flnode = kmem_cache_alloc(ramster_flnode_cache, ZCACHE_GFP_MASK); - if (unlikely(flnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - if (is_ephemeral(pool)) { - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - kmem_cache_free(ramster_flnode_cache, flnode); - goto out; - } - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->flnode == NULL) - kp->flnode = flnode; - else - kmem_cache_free(ramster_flnode_cache, flnode); - if (is_ephemeral(pool)) { - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - } - ret = 0; -out: - return ret; -} - -static int ramster_do_preload_flnode_only(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct flushlist_node *flnode; - int ret = -ENOMEM; - - BUG_ON(!irqs_disabled()); - if (unlikely(ramster_flnode_cache == NULL)) - BUG(); - kp = &__get_cpu_var(zcache_preloads); - flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC); - if (unlikely(flnode == NULL) && kp->flnode == NULL) - BUG(); /* FIXME handle more gracefully, but how??? 
*/ - else if (kp->flnode == NULL) - kp->flnode = flnode; - else - kmem_cache_free(ramster_flnode_cache, flnode); - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool) -{ - struct flushlist_node *flnode = NULL; - struct zcache_preload *kp; - int count; - - kp = &__get_cpu_var(zcache_preloads); - flnode = kp->flnode; - BUG_ON(flnode == NULL); - kp->flnode = NULL; - count = atomic_inc_return(&ramster_curr_flnode_count); - if (count > ramster_curr_flnode_count_max) - ramster_curr_flnode_count_max = count; - return flnode; -} - -static void ramster_flnode_free(struct flushlist_node *flnode, - struct tmem_pool *pool) -{ - atomic_dec(&ramster_curr_flnode_count); - BUG_ON(atomic_read(&ramster_curr_flnode_count) < 0); - kmem_cache_free(ramster_flnode_cache, flnode); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - - -static inline void dec_and_check(atomic_t *pvar) -{ - atomic_dec(pvar); - /* later when all accounting is fixed, make this a BUG */ - WARN_ON_ONCE(atomic_read(pvar) < 0); -} - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, size_t *out_len); - -static int zcache_pampd_eph_create(char *data, size_t size, bool raw, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, void **pampd) -{ - int ret = -1; - void *cdata = data; - size_t clen = size; - struct 
zcache_client *cli = pool->client; - uint16_t client_id = get_client_id_from_client(cli); - struct page *page = NULL; - unsigned long count; - - if (!raw) { - page = virt_to_page(data); - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - } - *pampd = (void *)zbud_create(client_id, pool->pool_id, oid, - index, page, cdata, clen); - if (*pampd == NULL) { - ret = -ENOMEM; - goto out; - } - ret = 0; - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - if (client_id != LOCAL_CLIENT) { - count = atomic_inc_return(&ramster_foreign_eph_pampd_count); - if (count > ramster_foreign_eph_pampd_count_max) - ramster_foreign_eph_pampd_count_max = count; - } -out: - return ret; -} - -static int zcache_pampd_pers_create(char *data, size_t size, bool raw, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index, void **pampd) -{ - int ret = -1; - void *cdata = data; - size_t clen = size; - struct zcache_client *cli = pool->client; - struct page *page; - unsigned long count; - unsigned long zv_mean_zsize; - struct zv_hdr *zv; - long curr_pers_pampd_count; - u64 total_zsize; -#ifdef RAMSTER_TESTING - static bool pampd_neg_warned; -#endif - - curr_pers_pampd_count = atomic_read(&zcache_curr_pers_pampd_count) - - atomic_read(&ramster_remote_pers_pages); -#ifdef RAMSTER_TESTING - /* should always be positive, but warn if accounting is off */ - if (!pampd_neg_warned) { - pr_warn("ramster: bad accounting for curr_pers_pampd_count\n"); - pampd_neg_warned = true; - } -#endif - if (curr_pers_pampd_count > - (zv_page_count_policy_percent * totalram_pages) / 100) { - zcache_policy_percent_exceeded++; - goto out; - } - if (raw) - goto ok_to_create; - page = virt_to_page(data); - if (zcache_compress(page, &cdata, &clen) == 0) - goto out; - /* reject if compression is too poor */ - if (clen > zv_max_zsize) { - zcache_compress_poor++; - goto out; - } - /* reject if mean compression is too poor */ - if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { - total_zsize = xv_get_total_size_bytes(cli->xvpool); - zv_mean_zsize = div_u64(total_zsize, curr_pers_pampd_count); - if (zv_mean_zsize > zv_max_mean_zsize) { - zcache_mean_compress_poor++; - goto out; - } - } -ok_to_create: - *pampd = (void *)zv_create(cli, pool->pool_id, oid, index, cdata, clen); - if (*pampd == NULL) { - ret = -ENOMEM; - goto out; - } - ret = 0; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - if (is_local_client(cli)) - goto out; - zv = *(struct zv_hdr **)pampd; - count = atomic_inc_return(&ramster_foreign_pers_pampd_count); - if (count > ramster_foreign_pers_pampd_count_max) - ramster_foreign_pers_pampd_count_max = count; -out: - return ret; -} - -static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index) -{ - void *pampd = NULL; - int ret; - bool ephemeral; - - BUG_ON(preemptible()); - ephemeral = (eph == 1) || ((eph == 0) && is_ephemeral(pool)); - if (ephemeral) - ret = zcache_pampd_eph_create(data, size, raw, pool, - oid, index, &pampd); - else - ret = zcache_pampd_pers_create(data, size, raw, pool, - oid, index, &pampd); - /* FIXME add some counters here for failed creates? 
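Both create paths above repeat the same accounting idiom: atomic_inc_return() on a counter, followed by an unlocked update of its high-water mark. A minimal sketch of that idiom as a shared helper (the helper name is illustrative and not part of this driver):

#include <linux/atomic.h>

static void ex_inc_and_track_max(atomic_t *counter, unsigned long *max)
{
	unsigned long count = atomic_inc_return(counter);

	/* the watermark is statistics-only, so the racy update is tolerated */
	if (count > *max)
		*max = count;
}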
*/ - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - - BUG_ON(preemptible()); - BUG_ON(is_ephemeral(pool)); /* Fix later for shared pools? */ - BUG_ON(pampd_is_remote(pampd)); - if (raw) - zv_copy_from_pampd(data, bufsize, pampd); - else - zv_decompress(virt_to_page(data), pampd); - return ret; -} - -static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - unsigned long flags; - struct zcache_client *cli = pool->client; - - BUG_ON(preemptible()); - BUG_ON(pampd_is_remote(pampd)); - if (is_ephemeral(pool)) { - local_irq_save(flags); - if (raw) - zbud_copy_from_pampd(data, bufsize, pampd); - else - ret = zbud_decompress(virt_to_page(data), pampd); - zbud_free_and_delist((struct zbud_hdr *)pampd); - local_irq_restore(flags); - if (!is_local_client(cli)) - dec_and_check(&ramster_foreign_eph_pampd_count); - dec_and_check(&zcache_curr_eph_pampd_count); - } else { - if (is_local_client(cli)) - BUG(); - if (raw) - zv_copy_from_pampd(data, bufsize, pampd); - else - zv_decompress(virt_to_page(data), pampd); - zv_free(cli->xvpool, pampd); - if (!is_local_client(cli)) - dec_and_check(&ramster_foreign_pers_pampd_count); - dec_and_check(&zcache_curr_pers_pampd_count); - ret = 0; - } - return ret; -} - -static bool zcache_pampd_is_remote(void *pampd) -{ - return pampd_is_remote(pampd); -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! 
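The pampd tests used throughout these pamops (pampd_is_remote(), pampd_is_intransit(), pampd_mask_intransit_and_remote()) are defined in ramster's headers, outside this hunk. As a rough, hedged illustration of the tagged-pointer idea behind them (the real names and bit assignments may well differ):

#include <linux/types.h>

#define EX_PAMPD_REMOTE_BIT	0x1UL	/* data lives on another node */
#define EX_PAMPD_INTRANSIT_BIT	0x2UL	/* local space reserved, data in flight */

static inline bool ex_pampd_is_remote(void *pampd)
{
	return ((unsigned long)pampd & EX_PAMPD_REMOTE_BIT) != 0;
}

static inline void *ex_pampd_mask_flags(void *pampd)
{
	return (void *)((unsigned long)pampd &
			~(EX_PAMPD_REMOTE_BIT | EX_PAMPD_INTRANSIT_BIT));
}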
- */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, bool acct) -{ - struct zcache_client *cli = pool->client; - bool eph = is_ephemeral(pool); - struct zv_hdr *zv; - - BUG_ON(preemptible()); - if (pampd_is_remote(pampd)) { - WARN_ON(acct == false); - if (oid == NULL) { - /* - * a NULL oid means to ignore this pampd free - * as the remote freeing will be handled elsewhere - */ - } else if (eph) { - /* FIXME remote flush optional but probably good idea */ - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_eph_pampd_count); - } else if (pampd_is_intransit(pampd)) { - /* did a pers remote get_and_free, so just free local */ - pampd = pampd_mask_intransit_and_remote(pampd); - goto local_pers; - } else { - struct flushlist_node *flnode = - ramster_flnode_alloc(pool); - - flnode->xh.client_id = pampd_remote_node(pampd); - flnode->xh.pool_id = pool->pool_id; - flnode->xh.oid = *oid; - flnode->xh.index = index; - flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE; - spin_lock(&zcache_rem_op_list_lock); - list_add(&flnode->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); - dec_and_check(&zcache_curr_pers_pampd_count); - dec_and_check(&ramster_remote_pers_pages); - } - } else if (eph) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - if (!is_local_client(pool->client)) - dec_and_check(&ramster_foreign_eph_pampd_count); - if (acct) - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_eph_pampd_count); - } else { -local_pers: - zv = (struct zv_hdr *)pampd; - if (!is_local_client(pool->client)) - dec_and_check(&ramster_foreign_pers_pampd_count); - zv_free(cli->xvpool, zv); - if (acct) - /* FIXME get these working properly again */ - dec_and_check(&zcache_curr_pers_pampd_count); - } -} - -static void zcache_pampd_free_obj(struct tmem_pool *pool, - struct tmem_obj *obj) -{ - struct flushlist_node *flnode; - - BUG_ON(preemptible()); - if (obj->extra == NULL) - return; - BUG_ON(!pampd_is_remote(obj->extra)); - flnode = ramster_flnode_alloc(pool); - flnode->xh.client_id = pampd_remote_node(obj->extra); - flnode->xh.pool_id = pool->pool_id; - flnode->xh.oid = obj->oid; - flnode->xh.index = FLUSH_ENTIRE_OBJECT; - flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ; - spin_lock(&zcache_rem_op_list_lock); - list_add(&flnode->rem_op.list, &zcache_rem_op_list); - spin_unlock(&zcache_rem_op_list_lock); -} - -void zcache_pampd_new_obj(struct tmem_obj *obj) -{ - obj->extra = NULL; -} - -int zcache_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj) -{ - int ret = -1; - - if (new_pampd != NULL) { - if (obj->extra == NULL) - obj->extra = new_pampd; - /* enforce that all remote pages in an object reside - * in the same node! */ - else if (pampd_remote_node(new_pampd) != - pampd_remote_node((void *)(obj->extra))) - BUG(); - ret = 0; - } - return ret; -} - -/* - * Called by the message handler after a (still compressed) page has been - * fetched from the remote machine in response to an "is_remote" tmem_get - * or persistent tmem_localify. For a tmem_get, "extra" is the address of - * the page that is to be filled to succesfully resolve the tmem_get; for - * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only - * in the local zcache). "data" points to "size" bytes of (compressed) data - * passed in the message. 
In the case of a persistent remote get, if - * pre-allocation was successful (see zcache_repatriate_preload), the page - * is placed into both local zcache and at "extra". - */ -int zcache_localify(int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t size, - void *extra) -{ - int ret = -ENOENT; - unsigned long flags; - struct tmem_pool *pool; - bool ephemeral, delete = false; - size_t clen = PAGE_SIZE; - void *pampd, *saved_hb; - struct tmem_obj *obj; - - pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id); - if (unlikely(pool == NULL)) - /* pool doesn't exist anymore */ - goto out; - ephemeral = is_ephemeral(pool); - local_irq_save(flags); /* FIXME: maybe only disable softirqs? */ - pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb); - if (pampd == NULL) { - /* hmmm... must have been a flush while waiting */ -#ifdef RAMSTER_TESTING - pr_err("UNTESTED pampd==NULL in zcache_localify\n"); -#endif - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - ramster_remote_pers_pages_unsucc_get++; - obj = NULL; - goto finish; - } else if (unlikely(!pampd_is_remote(pampd))) { - /* hmmm... must have been a dup put while waiting */ -#ifdef RAMSTER_TESTING - pr_err("UNTESTED dup while waiting in zcache_localify\n"); -#endif - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - ramster_remote_pers_pages_unsucc_get++; - obj = NULL; - pampd = NULL; - ret = -EEXIST; - goto finish; - } else if (size == 0) { - /* no remote data, delete the local is_remote pampd */ - pampd = NULL; - if (ephemeral) - ramster_remote_eph_pages_unsucc_get++; - else - BUG(); - delete = true; - goto finish; - } - if (!ephemeral && pampd_is_intransit(pampd)) { - /* localify to zcache */ - pampd = pampd_mask_intransit_and_remote(pampd); - zv_copy_to_pampd(pampd, data, size); - } else { - pampd = NULL; - obj = NULL; - } - if (extra != NULL) { - /* decompress direct-to-memory to complete remotify */ - ret = lzo1x_decompress_safe((char *)data, size, - (char *)extra, &clen); - BUG_ON(ret != LZO_E_OK); - BUG_ON(clen != PAGE_SIZE); - } - if (ephemeral) - ramster_remote_eph_pages_succ_get++; - else - ramster_remote_pers_pages_succ_get++; - ret = 0; -finish: - tmem_localify_finish(obj, index, pampd, saved_hb, delete); - zcache_put_pool(pool); - local_irq_restore(flags); -out: - return ret; -} - -/* - * Called on a remote persistent tmem_get to attempt to preallocate - * local storage for the data contained in the remote persistent page. - * If succesfully preallocated, returns the pampd, marked as remote and - * in_transit. Else returns NULL. Note that the appropriate tmem data - * structure must be locked. 
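zcache_localify() above decompresses straight into the destination page passed via "extra" using lzo1x_decompress_safe(). A self-contained sketch of that call pattern, with the destination mapped explicitly (the wrapper name is illustrative):

#include <linux/highmem.h>
#include <linux/lzo.h>
#include <linux/mm.h>

static int ex_decompress_to_page(const unsigned char *cdata, size_t clen,
				 struct page *page)
{
	size_t dlen = PAGE_SIZE;
	unsigned char *to_va = kmap_atomic(page);
	int ret = lzo1x_decompress_safe(cdata, clen, to_va, &dlen);

	kunmap_atomic(to_va);
	/* anything other than a full page back means the source was bad */
	return (ret == LZO_E_OK && dlen == PAGE_SIZE) ? 0 : -EINVAL;
}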
- */ -static void *zcache_pampd_repatriate_preload(void *pampd, - struct tmem_pool *pool, - struct tmem_oid *oid, - uint32_t index, - bool *intransit) -{ - int clen = pampd_remote_size(pampd); - void *ret_pampd = NULL; - unsigned long flags; - - if (!pampd_is_remote(pampd)) - BUG(); - if (is_ephemeral(pool)) - BUG(); - if (pampd_is_intransit(pampd)) { - /* - * to avoid multiple allocations (and maybe a memory leak) - * don't preallocate if already in the process of being - * repatriated - */ - *intransit = true; - goto out; - } - *intransit = false; - local_irq_save(flags); - ret_pampd = (void *)zv_alloc(pool, oid, index, clen); - if (ret_pampd != NULL) { - /* - * a pampd is marked intransit if it is remote and space has - * been allocated for it locally (note, only happens for - * persistent pages, in which case the remote copy is freed) - */ - ret_pampd = pampd_mark_intransit(ret_pampd); - dec_and_check(&ramster_remote_pers_pages); - } else - ramster_pers_pages_remote_nomem++; - local_irq_restore(flags); -out: - return ret_pampd; -} - -/* - * Called on a remote tmem_get to invoke a message to fetch the page. - * Might sleep so no tmem locks can be held. "extra" is passed - * all the way through the round-trip messaging to zcache_localify. - */ -static int zcache_pampd_repatriate(void *fake_pampd, void *real_pampd, - struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index, - bool free, void *extra) -{ - struct tmem_xhandle xh; - int ret; - - if (pampd_is_intransit(real_pampd)) - /* have local space pre-reserved, so free remote copy */ - free = true; - xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index); - /* unreliable request/response for now */ - ret = ramster_remote_async_get(&xh, free, - pampd_remote_node(fake_pampd), - pampd_remote_size(fake_pampd), - pampd_remote_cksum(fake_pampd), - extra); -#ifdef RAMSTER_TESTING - if (ret != 0 && ret != -ENOENT) - pr_err("TESTING zcache_pampd_repatriate returns, ret=%d\n", - ret); -#endif - return ret; -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .free = zcache_pampd_free, - .get_data_and_free = zcache_pampd_get_data_and_free, - .free_obj = zcache_pampd_free_obj, - .is_remote = zcache_pampd_is_remote, - .repatriate_preload = zcache_pampd_repatriate_preload, - .repatriate = zcache_pampd_repatriate, - .new_obj = zcache_pampd_new_obj, - .replace_in_obj = zcache_pampd_replace_in_obj, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS -#define LZO_DSTMEM_PAGE_ORDER 1 -static DEFINE_PER_CPU(unsigned char *, zcache_workmem); -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); - -static int zcache_compress(struct page *from, void **out_va, size_t *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - unsigned char *wmem = __get_cpu_var(zcache_workmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL || wmem == NULL)) - goto out; /* no buffer, so can't compress */ - from_va = kmap_atomic(from); - mb(); - ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem); - BUG_ON(ret != LZO_E_OK); - *out_va = dmem; - kunmap_atomic(from_va); - ret = 1; -out: - return ret; -} - - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - 
GFP_KERNEL | __GFP_REPEAT, - LZO_DSTMEM_PAGE_ORDER), - per_cpu(zcache_workmem, cpu) = - kzalloc(LZO1X_MEM_COMPRESS, - GFP_KERNEL | __GFP_REPEAT); - per_cpu(zcache_remoteputmem, cpu) = - kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - kfree(per_cpu(zcache_remoteputmem, cpu)); - per_cpu(zcache_remoteputmem, cpu) = NULL; - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - LZO_DSTMEM_PAGE_ORDER); - per_cpu(zcache_dstmem, cpu) = NULL; - kfree(per_cpu(zcache_workmem, cpu)); - per_cpu(zcache_workmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - if (kp->obj) { - kmem_cache_free(zcache_obj_cache, kp->obj); - kp->obj = NULL; - } - if (kp->flnode) { - kmem_cache_free(ramster_flnode_cache, kp->flnode); - kp->flnode = NULL; - } - if (kp->page) { - free_page((unsigned long)kp->page); - kp->page = NULL; - } - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(nonactive_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO(mean_compress_poor); -ZCACHE_SYSFS_RO(policy_percent_exceeded); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); -ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, - zv_curr_dist_counts_show); -ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, - 
zv_cumul_dist_counts_show); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_nonactive_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_policy_percent_exceeded_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_mean_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - &zcache_zv_curr_dist_counts_attr.attr, - &zcache_zv_cumul_dist_counts_attr.attr, - &zcache_zv_max_zsize_attr.attr, - &zcache_zv_max_mean_zsize_attr.attr, - &zcache_zv_page_count_policy_percent_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; - -#define RAMSTER_SYSFS_RO(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", ramster_##_name); \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = ramster_##_name##_show, \ - } - -#define RAMSTER_SYSFS_RW(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", ramster_##_name); \ - } \ - static ssize_t ramster_##_name##_store(struct kobject *kobj, \ - struct kobj_attribute *attr, const char *buf, size_t count) \ - { \ - int err; \ - unsigned long enable; \ - err = kstrtoul(buf, 10, &enable); \ - if (err) \ - return -EINVAL; \ - ramster_##_name = enable; \ - return count; \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0644 }, \ - .show = ramster_##_name##_show, \ - .store = ramster_##_name##_store, \ - } - -#define RAMSTER_SYSFS_RO_ATOMIC(_name) \ - static ssize_t ramster_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \ - } \ - static struct kobj_attribute ramster_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = ramster_##_name##_show, \ - } - -RAMSTER_SYSFS_RO(interface_revision); -RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages); -RAMSTER_SYSFS_RW(pers_remotify_enable); -RAMSTER_SYSFS_RW(eph_remotify_enable); -RAMSTER_SYSFS_RO(eph_pages_remoted); -RAMSTER_SYSFS_RO(eph_pages_remote_failed); -RAMSTER_SYSFS_RO(pers_pages_remoted); -RAMSTER_SYSFS_RO(pers_pages_remote_failed); -RAMSTER_SYSFS_RO(pers_pages_remote_nomem); -RAMSTER_SYSFS_RO(remote_pages_flushed); -RAMSTER_SYSFS_RO(remote_page_flushes_failed); -RAMSTER_SYSFS_RO(remote_objects_flushed); -RAMSTER_SYSFS_RO(remote_object_flushes_failed); -RAMSTER_SYSFS_RO(remote_eph_pages_succ_get); 
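For reference, a single invocation such as RAMSTER_SYSFS_RO(interface_revision) above expands to roughly the following (reconstructed by hand from the macro, so treat it as illustrative):

static ssize_t ramster_interface_revision_show(struct kobject *kobj,
		struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ramster_interface_revision);
}
static struct kobj_attribute ramster_interface_revision_attr = {
	.attr = { .name = "interface_revision", .mode = 0444 },
	.show = ramster_interface_revision_show,
};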
-RAMSTER_SYSFS_RO(remote_eph_pages_unsucc_get); -RAMSTER_SYSFS_RO(remote_pers_pages_succ_get); -RAMSTER_SYSFS_RO(remote_pers_pages_unsucc_get); -RAMSTER_SYSFS_RO_ATOMIC(foreign_eph_pampd_count); -RAMSTER_SYSFS_RO(foreign_eph_pampd_count_max); -RAMSTER_SYSFS_RO_ATOMIC(foreign_pers_pampd_count); -RAMSTER_SYSFS_RO(foreign_pers_pampd_count_max); -RAMSTER_SYSFS_RO_ATOMIC(curr_flnode_count); -RAMSTER_SYSFS_RO(curr_flnode_count_max); - -#define MANUAL_NODES 8 -static bool ramster_nodes_manual_up[MANUAL_NODES]; -static ssize_t ramster_manual_node_up_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - int i; - char *p = buf; - for (i = 0; i < MANUAL_NODES; i++) - if (ramster_nodes_manual_up[i]) - p += sprintf(p, "%d ", i); - p += sprintf(p, "\n"); - return p - buf; -} - -static ssize_t ramster_manual_node_up_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long node_num; - - err = kstrtoul(buf, 10, &node_num); - if (err) { - pr_err("ramster: bad strtoul?\n"); - return -EINVAL; - } - if (node_num >= MANUAL_NODES) { - pr_err("ramster: bad node_num=%lu?\n", node_num); - return -EINVAL; - } - if (ramster_nodes_manual_up[node_num]) { - pr_err("ramster: node %d already up, ignoring\n", - (int)node_num); - } else { - ramster_nodes_manual_up[node_num] = true; - r2net_hb_node_up_manual((int)node_num); - } - return count; -} - -static struct kobj_attribute ramster_manual_node_up_attr = { - .attr = { .name = "manual_node_up", .mode = 0644 }, - .show = ramster_manual_node_up_show, - .store = ramster_manual_node_up_store, -}; - -static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - if (ramster_remote_target_nodenum == -1UL) - return sprintf(buf, "unset\n"); - else - return sprintf(buf, "%d\n", ramster_remote_target_nodenum); -} - -static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - int err; - unsigned long node_num; - - err = kstrtoul(buf, 10, &node_num); - if (err) { - pr_err("ramster: bad strtoul?\n"); - return -EINVAL; - } else if (node_num == -1UL) { - pr_err("ramster: disabling all remotification, " - "data may still reside on remote nodes however\n"); - return -EINVAL; - } else if (node_num >= MANUAL_NODES) { - pr_err("ramster: bad node_num=%lu?\n", node_num); - return -EINVAL; - } else if (!ramster_nodes_manual_up[node_num]) { - pr_err("ramster: node %d not up, ignoring setting " - "of remotification target\n", (int)node_num); - } else if (r2net_remote_target_node_set((int)node_num) >= 0) { - pr_info("ramster: node %d set as remotification target\n", - (int)node_num); - ramster_remote_target_nodenum = (int)node_num; - } else { - pr_err("ramster: bad num to node node_num=%d?\n", - (int)node_num); - return -EINVAL; - } - return count; -} - -static struct kobj_attribute ramster_remote_target_nodenum_attr = { - .attr = { .name = "remote_target_nodenum", .mode = 0644 }, - .show = ramster_remote_target_nodenum_show, - .store = ramster_remote_target_nodenum_store, -}; - - -static struct attribute *ramster_attrs[] = { - &ramster_interface_revision_attr.attr, - &ramster_pers_remotify_enable_attr.attr, - &ramster_eph_remotify_enable_attr.attr, - &ramster_remote_pers_pages_attr.attr, - &ramster_eph_pages_remoted_attr.attr, - &ramster_eph_pages_remote_failed_attr.attr, - &ramster_pers_pages_remoted_attr.attr, - &ramster_pers_pages_remote_failed_attr.attr, - 
&ramster_pers_pages_remote_nomem_attr.attr, - &ramster_remote_pages_flushed_attr.attr, - &ramster_remote_page_flushes_failed_attr.attr, - &ramster_remote_objects_flushed_attr.attr, - &ramster_remote_object_flushes_failed_attr.attr, - &ramster_remote_eph_pages_succ_get_attr.attr, - &ramster_remote_eph_pages_unsucc_get_attr.attr, - &ramster_remote_pers_pages_succ_get_attr.attr, - &ramster_remote_pers_pages_unsucc_get_attr.attr, - &ramster_foreign_eph_pampd_count_attr.attr, - &ramster_foreign_eph_pampd_count_max_attr.attr, - &ramster_foreign_pers_pampd_count_attr.attr, - &ramster_foreign_pers_pampd_count_max_attr.attr, - &ramster_curr_flnode_count_attr.attr, - &ramster_curr_flnode_count_max_attr.attr, - &ramster_manual_node_up_attr.attr, - &ramster_remote_target_nodenum_attr.attr, - NULL, -}; - -static struct attribute_group ramster_attr_group = { - .attrs = ramster_attrs, - .name = "ramster", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int ret = -1; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - zbud_evict_pages(nr); - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -int zcache_put(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t size, - bool raw, int ephemeral) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, data, size, raw, ephemeral); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -int zcache_get(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, char *data, size_t *sizep, - bool raw, int get_and_free) -{ - struct tmem_pool *pool; - int ret = -1; - bool eph; - - if (!raw) { - BUG_ON(irqs_disabled()); - BUG_ON(in_softirq()); - } - pool = zcache_get_pool_by_id(cli_id, pool_id); - eph = is_ephemeral(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, data, sizep, - raw, get_and_free); - zcache_put_pool(pool); - } - WARN_ONCE((!eph && (ret != 0)), "zcache_get fails on persistent pool, " - "bad things are very likely to happen soon\n"); -#ifdef RAMSTER_TESTING - if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool))) - 
pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret); -#endif - if (ret == -EAGAIN) - BUG(); /* FIXME... don't need this anymore??? let's ensure */ - return ret; -} - -int zcache_flush(int cli_id, int pool_id, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - ramster_do_preload_flnode_only(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -int zcache_flush_object(int cli_id, int pool_id, struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - ramster_do_preload_flnode_only(pool); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - local_irq_restore(flags); - return ret; -} - -int zcache_client_destroy_pool(int cli_id, int pool_id) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool == NULL) - goto out; - cli->tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - atomic_dec(&cli->refcount); - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("ramster: destroyed pool id=%d cli_id=%d\n", pool_id, cli_id); -out: - return ret; -} - -static int zcache_destroy_pool(int pool_id) -{ - return zcache_client_destroy_pool(LOCAL_CLIENT, pool_id); -} - -int zcache_new_pool(uint16_t cli_id, uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); - if (pool == NULL) { - pr_info("ramster: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (cli->tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("ramster: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - cli->tmem_pools[poolid] = pool; - if (cli_id == LOCAL_CLIENT) - pr_info("ramster: created %s tmem pool, id=%d, local client\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid); - else - pr_info("ramster: created %s tmem pool, id=%d, client=%d\n", - flags & TMEM_POOL_PERSIST ? 
"persistent" : "ephemeral", - poolid, cli_id); -out: - if (cli != NULL) - atomic_dec(&cli->refcount); - return poolid; -} - -static int zcache_local_new_pool(uint32_t flags) -{ - return zcache_new_pool(LOCAL_CLIENT, flags); -} - -int zcache_autocreate_pool(int cli_id, int pool_id, bool ephemeral) -{ - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - uint32_t flags = ephemeral ? 0 : TMEM_POOL_PERSIST; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - goto out; - if (pool_id >= MAX_POOLS_PER_CLIENT) - goto out; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if ((ephemeral && !use_cleancache) || (!ephemeral && !use_frontswap)) - BUG(); /* FIXME, handle more gracefully later */ - if (!cli->allocated) { - if (zcache_new_client(cli_id)) - BUG(); /* FIXME, handle more gracefully later */ - cli = &zcache_clients[cli_id]; - } - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool != NULL) { - if (pool->persistent && ephemeral) { - pr_err("zcache_autocreate_pool: type mismatch\n"); - goto out; - } - ret = 0; - goto out; - } - pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL); - if (pool == NULL) { - pr_info("ramster: pool creation failed: out of memory\n"); - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = pool_id; - tmem_new_pool(pool, flags); - cli->tmem_pools[pool_id] = pool; - pr_info("ramster: AUTOcreated %s tmem poolid=%d, for remote client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - pool_id, cli_id); - ret = 0; -out: - if (cli == NULL) - BUG(); /* FIXME, handle more gracefully later */ - /* pr_err("zcache_autocreate_pool: failed\n"); */ - if (cli != NULL) - atomic_dec(&cli->refcount); - return ret; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. 
- */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - -#ifdef __PG_WAS_ACTIVE - if (!PageWasActive(page)) { - zcache_nonactive_puts++; - return; - } -#endif - if (likely(ind == index)) { - char *kva = page_address(page); - - (void)zcache_put(LOCAL_CLIENT, pool_id, &oid, index, - kva, PAGE_SIZE, 0, 1); - } -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - preempt_disable(); - if (likely(ind == index)) { - char *kva = page_address(page); - size_t size = PAGE_SIZE; - - ret = zcache_get(LOCAL_CLIENT, pool_id, &oid, index, - kva, &size, 0, 0); -#ifdef __PG_WAS_ACTIVE - if (ret == 0) - SetPageWasActive(page); -#endif - } - preempt_enable(); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush(LOCAL_CLIENT, pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_local_new_pool(0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_local_new_pool(0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .invalidate_page = zcache_cleancache_flush_page, - .invalidate_inode = zcache_cleancache_flush_inode, - .invalidate_fs = zcache_cleancache_flush_fs, - .init_shared_fs = zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. 
Later, larger nr_cpus -> larger SWIZ_BITS - */ -#define SWIZ_BITS 8 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - char *kva; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - kva = page_address(page); - ret = zcache_put(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), kva, PAGE_SIZE, 0, 0); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - preempt_disable(); /* FIXME, remove this? */ - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - char *kva = page_address(page); - size_t size = PAGE_SIZE; - - ret = zcache_get(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), kva, &size, 0, -1); - } - preempt_enable(); /* FIXME, remove this? */ - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(LOCAL_CLIENT, - zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = - zcache_local_new_pool(TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .invalidate_page = zcache_frontswap_flush_page, - .invalidate_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * frontswap selfshrinking - */ - -#ifdef CONFIG_FRONTSWAP -/* In HZ, controls frequency of worker invocation. */ -static unsigned int selfshrink_interval __read_mostly = 5; - -static void selfshrink_process(struct work_struct *work); -static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process); - -/* Enable/disable with sysfs. */ -static bool frontswap_selfshrinking __read_mostly; - -/* Enable/disable with kernel boot option. */ -static bool use_frontswap_selfshrink __initdata = true; - -/* - * The default values for the following parameters were deemed reasonable - * by experimentation, may be workload-dependent, and can all be - * adjusted via sysfs. 
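To make the swizzling defined earlier in this hunk concrete: with SWIZ_BITS set to 8, a swap offset is split so that its low 8 bits select one of 256 tmem objects per swap type while the remaining bits become the page index inside that object, spreading consecutive offsets across many objects. A small worked example (the values are arbitrary):

static void ex_swiz_demo(void)
{
	u32 ind = 0x12345;
	struct tmem_oid oid = oswiz(2, ind);	/* swap type 2 */
	u32 index = iswiz(ind);

	/* oid.oid[0] == (2 << 8) | 0x45 == 0x245, index == 0x123 */
	BUG_ON(oid.oid[0] != 0x245 || index != 0x123);
}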
- */ - -/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ -static unsigned int frontswap_hysteresis __read_mostly = 20; - -/* - * Number of selfshrink worker invocations to wait before observing that - * frontswap selfshrinking should commence. Note that selfshrinking does - * not use a separate worker thread. - */ -static unsigned int frontswap_inertia __read_mostly = 3; - -/* Countdown to next invocation of frontswap_shrink() */ -static unsigned long frontswap_inertia_counter; - -/* - * Invoked by the selfshrink worker thread, uses current number of pages - * in frontswap (frontswap_curr_pages()), previous status, and control - * values (hysteresis and inertia) to determine if frontswap should be - * shrunk and what the new frontswap size should be. Note that - * frontswap_shrink is essentially a partial swapoff that immediately - * transfers pages from the "swap device" (frontswap) back into kernel - * RAM; despite the name, frontswap "shrinking" is very different from - * the "shrinker" interface used by the kernel MM subsystem to reclaim - * memory. - */ -static void frontswap_selfshrink(void) -{ - static unsigned long cur_frontswap_pages; - static unsigned long last_frontswap_pages; - static unsigned long tgt_frontswap_pages; - - last_frontswap_pages = cur_frontswap_pages; - cur_frontswap_pages = frontswap_curr_pages(); - if (!cur_frontswap_pages || - (cur_frontswap_pages > last_frontswap_pages)) { - frontswap_inertia_counter = frontswap_inertia; - return; - } - if (frontswap_inertia_counter && --frontswap_inertia_counter) - return; - if (cur_frontswap_pages <= frontswap_hysteresis) - tgt_frontswap_pages = 0; - else - tgt_frontswap_pages = cur_frontswap_pages - - (cur_frontswap_pages / frontswap_hysteresis); - frontswap_shrink(tgt_frontswap_pages); -} - -static int __init ramster_nofrontswap_selfshrink_setup(char *s) -{ - use_frontswap_selfshrink = false; - return 1; -} - -__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup); - -static void selfshrink_process(struct work_struct *work) -{ - if (frontswap_selfshrinking && frontswap_enabled) { - frontswap_selfshrink(); - schedule_delayed_work(&selfshrink_worker, - selfshrink_interval * HZ); - } -} - -static int ramster_enabled; - -static int __init ramster_selfshrink_init(void) -{ - frontswap_selfshrinking = ramster_enabled && use_frontswap_selfshrink; - if (frontswap_selfshrinking) - pr_info("ramster: Initializing frontswap " - "selfshrinking driver.\n"); - else - return -ENODEV; - - schedule_delayed_work(&selfshrink_worker, selfshrink_interval * HZ); - - return 0; -} - -subsys_initcall(ramster_selfshrink_init); -#endif - -/* - * zcache initialization - * NOTE FOR NOW ramster MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! - */ - -static int ramster_enabled; - -static int __init enable_ramster(char *s) -{ - ramster_enabled = 1; - return 1; -} -__setup("ramster", enable_ramster); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - pr_info("INIT no_cleancache called\n"); - use_cleancache = 0; - return 1; -} - -/* - * FIXME: need to guarantee this gets checked before zcache_init is called - * What is the correct way to achieve this? 
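As a concrete reading of frontswap_selfshrink() above: with the default frontswap_hysteresis of 20, once the resident page count stops growing and the three-interval inertia countdown has expired, each pass asks frontswap_shrink() to shed roughly one twentieth of what remains. The target computation in isolation:

static unsigned long ex_selfshrink_target(unsigned long cur,
					  unsigned int hysteresis)
{
	if (cur <= hysteresis)
		return 0;
	/* e.g. cur = 1000, hysteresis = 20:  1000 - 1000 / 20 = 950 */
	return cur - cur / hysteresis;
}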
- */ -early_param("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - pr_info("INIT no_frontswap called\n"); - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init zcache_init(void) -{ - int ret = 0; - -#ifdef CONFIG_SYSFS - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - ret = sysfs_create_group(mm_kobj, &ramster_attr_group); - if (ret) { - pr_err("ramster: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (ramster_enabled) { - unsigned int cpu; - - (void)r2net_register_handlers(); - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("ramster: can't register cpu notifier\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); - ramster_flnode_cache = kmem_cache_create("ramster_flnode", - sizeof(struct flushlist_node), 0, 0, NULL); -#endif -#ifdef CONFIG_CLEANCACHE - pr_info("INIT ramster_enabled=%d use_cleancache=%d\n", - ramster_enabled, use_cleancache); - if (ramster_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("ramster: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("ramster: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - pr_info("INIT ramster_enabled=%d use_frontswap=%d\n", - ramster_enabled, use_frontswap); - if (ramster_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - zcache_new_client(LOCAL_CLIENT); - old_ops = zcache_frontswap_register_ops(); - pr_info("ramster: frontswap enabled using kernel " - "transcendent memory and xvmalloc\n"); - if (old_ops.init != NULL) - pr_warning("ramster: frontswap_ops overridden"); - } - if (ramster_enabled && (use_frontswap || use_cleancache)) - ramster_remotify_init(); -#endif -out: - return ret; -} - -module_init(zcache_init) diff --git a/drivers/staging/ramster/zcache.h b/drivers/staging/ramster/zcache.h deleted file mode 100644 index 250b121c2..000000000 --- a/drivers/staging/ramster/zcache.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * zcache.h - * - * External zcache functions - * - * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. 
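The init path above only does real work when the "ramster" boot parameter is given, and the other __setup()/early_param() hooks in this hunk can then disable individual pieces; for example, booting with

	ramster nocleancache noselfshrink

enables ramster, skips the cleancache shim, and turns off frontswap selfshrinking while leaving the frontswap shim active. As for the FIXME above about ordering: for built-in code, early_param() callbacks run from parse_early_param() during start_kernel(), well before the module_init() initcalls, so nocleancache is evaluated before zcache_init().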
- */ - -#ifndef _ZCACHE_H_ -#define _ZCACHE_H_ - -extern int zcache_put(int, int, struct tmem_oid *, uint32_t, - char *, size_t, bool, int); -extern int zcache_autocreate_pool(int, int, bool); -extern int zcache_get(int, int, struct tmem_oid *, uint32_t, - char *, size_t *, bool, int); -extern int zcache_flush(int, int, struct tmem_oid *, uint32_t); -extern int zcache_flush_object(int, int, struct tmem_oid *); -extern int zcache_localify(int, struct tmem_oid *, uint32_t, - char *, size_t, void *); - -#endif /* _ZCACHE_H */ diff --git a/drivers/staging/zcache/Kconfig b/drivers/staging/zcache/Kconfig deleted file mode 100644 index 7048e01f0..000000000 --- a/drivers/staging/zcache/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -config ZCACHE - bool "Dynamic compression of swap pages and clean pagecache pages" - # X86 dependency is because zsmalloc uses non-portable pte/tlb - # functions - depends on (CLEANCACHE || FRONTSWAP) && CRYPTO=y && X86 - select ZSMALLOC - select CRYPTO_LZO - default n - help - Zcache doubles RAM efficiency while providing a significant - performance boosts on many workloads. Zcache uses - compression and an in-kernel implementation of transcendent - memory to store clean page cache pages and swap in RAM, - providing a noticeable reduction in disk I/O. diff --git a/drivers/staging/zcache/Makefile b/drivers/staging/zcache/Makefile deleted file mode 100644 index 60daa272c..000000000 --- a/drivers/staging/zcache/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zcache-y := zcache-main.o tmem.o - -obj-$(CONFIG_ZCACHE) += zcache.o diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c deleted file mode 100644 index 1ca66ea9b..000000000 --- a/drivers/staging/zcache/tmem.c +++ /dev/null @@ -1,770 +0,0 @@ -/* - * In-kernel transcendent memory (generic implementation) - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - * - * The primary purpose of Transcedent Memory ("tmem") is to map object-oriented - * "handles" (triples containing a pool id, and object id, and an index), to - * pages in a page-accessible memory (PAM). Tmem references the PAM pages via - * an abstract "pampd" (PAM page-descriptor), which can be operated on by a - * set of functions (pamops). Each pampd contains some representation of - * PAGE_SIZE bytes worth of data. Tmem must support potentially millions of - * pages and must be able to insert, find, and delete these pages at a - * potential frequency of thousands per second concurrently across many CPUs, - * (and, if used with KVM, across many vcpus across many guests). - * Tmem is tracked with a hierarchy of data structures, organized by - * the elements in a handle-tuple: pool_id, object_id, and page index. - * One or more "clients" (e.g. guests) each provide one or more tmem_pools. - * Each pool, contains a hash table of rb_trees of tmem_objs. Each - * tmem_obj contains a radix-tree-like tree of pointers, with intermediate - * nodes called tmem_objnodes. Each leaf pointer in this tree points to - * a pampd, which is accessible only through a small set of callbacks - * registered by the PAM implementation (see tmem_register_pamops). Tmem - * does all memory allocation via a set of callbacks registered by the tmem - * host implementation (e.g. see tmem_register_hostops). - */ - -#include -#include -#include - -#include "tmem.h" - -/* data structure sentinels used for debugging... 
see tmem.h */ -#define POOL_SENTINEL 0x87658765 -#define OBJ_SENTINEL 0x12345678 -#define OBJNODE_SENTINEL 0xfedcba09 - -/* - * A tmem host implementation must use this function to register callbacks - * for memory allocation. - */ -static struct tmem_hostops tmem_hostops; - -static void tmem_objnode_tree_init(void); - -void tmem_register_hostops(struct tmem_hostops *m) -{ - tmem_objnode_tree_init(); - tmem_hostops = *m; -} - -/* - * A tmem host implementation must use this function to register - * callbacks for a page-accessible memory (PAM) implementation - */ -static struct tmem_pamops tmem_pamops; - -void tmem_register_pamops(struct tmem_pamops *m) -{ - tmem_pamops = *m; -} - -/* - * Oid's are potentially very sparse and tmem_objs may have an indeterminately - * short life, being added and deleted at a relatively high frequency. - * So an rb_tree is an ideal data structure to manage tmem_objs. But because - * of the potentially huge number of tmem_objs, each pool manages a hashtable - * of rb_trees to reduce search, insert, delete, and rebalancing time. - * Each hashbucket also has a lock to manage concurrent access. - * - * The following routines manage tmem_objs. When any tmem_obj is accessed, - * the hashbucket lock must be held. - */ - -/* searches for object==oid in pool, returns locked object if found */ -static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, - struct tmem_oid *oidp) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - - rbnode = hb->obj_rb_root.rb_node; - while (rbnode) { - BUG_ON(RB_EMPTY_NODE(rbnode)); - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - switch (tmem_oid_compare(oidp, &obj->oid)) { - case 0: /* equal */ - goto out; - case -1: - rbnode = rbnode->rb_left; - break; - case 1: - rbnode = rbnode->rb_right; - break; - } - } - obj = NULL; -out: - return obj; -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *); - -/* free an object that has no more pampds in it */ -static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) -{ - struct tmem_pool *pool; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pampd_count > 0); - pool = obj->pool; - BUG_ON(pool == NULL); - if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ - tmem_pampd_destroy_all_in_obj(obj); - BUG_ON(obj->objnode_tree_root != NULL); - BUG_ON((long)obj->objnode_count != 0); - atomic_dec(&pool->obj_count); - BUG_ON(atomic_read(&pool->obj_count) < 0); - INVERT_SENTINEL(obj, OBJ); - obj->pool = NULL; - tmem_oid_set_invalid(&obj->oid); - rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); -} - -/* - * initialize, and insert an tmem_object_root (called only if find failed) - */ -static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, - struct tmem_pool *pool, - struct tmem_oid *oidp) -{ - struct rb_root *root = &hb->obj_rb_root; - struct rb_node **new = &(root->rb_node), *parent = NULL; - struct tmem_obj *this; - - BUG_ON(pool == NULL); - atomic_inc(&pool->obj_count); - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - obj->pool = pool; - obj->oid = *oidp; - obj->objnode_count = 0; - obj->pampd_count = 0; - (*tmem_pamops.new_obj)(obj); - SET_SENTINEL(obj, OBJ); - while (*new) { - BUG_ON(RB_EMPTY_NODE(*new)); - this = rb_entry(*new, struct tmem_obj, rb_tree_node); - parent = *new; - switch (tmem_oid_compare(oidp, &this->oid)) { - case 0: - BUG(); /* already present; should never happen! 
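As the comment above tmem_obj_init() notes, this path runs only after tmem_obj_find() has just failed for the same oid, with the hashbucket lock held across both calls, so hitting an equal key during the insert walk would indicate the lock was dropped in between or the rb-tree is corrupt, hence the BUG().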
*/ - break; - case -1: - new = &(*new)->rb_left; - break; - case 1: - new = &(*new)->rb_right; - break; - } - } - rb_link_node(&obj->rb_tree_node, parent, new); - rb_insert_color(&obj->rb_tree_node, root); -} - -/* - * Tmem is managed as a set of tmem_pools with certain attributes, such as - * "ephemeral" vs "persistent". These attributes apply to all tmem_objs - * and all pampds that belong to a tmem_pool. A tmem_pool is created - * or deleted relatively rarely (for example, when a filesystem is - * mounted or unmounted. - */ - -/* flush all data from a pool and, optionally, free it */ -static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) -{ - struct rb_node *rbnode; - struct tmem_obj *obj; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - BUG_ON(pool == NULL); - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - spin_lock(&hb->lock); - rbnode = rb_first(&hb->obj_rb_root); - while (rbnode != NULL) { - obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); - rbnode = rb_next(rbnode); - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - spin_unlock(&hb->lock); - } - if (destroy) - list_del(&pool->pool_list); -} - -/* - * A tmem_obj contains a radix-tree-like tree in which the intermediate - * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation - * is very specialized and tuned for specific uses and is not particularly - * suited for use from this code, though some code from the core algorithms has - * been reused, thus the copyright notices below). Each tmem_objnode contains - * a set of pointers which point to either a set of intermediate tmem_objnodes - * or a set of of pampds. - * - * Portions Copyright (C) 2001 Momchil Velikov - * Portions Copyright (C) 2001 Christoph Hellwig - * Portions Copyright (C) 2005 SGI, Christoph Lameter - */ - -struct tmem_objnode_tree_path { - struct tmem_objnode *objnode; - int offset; -}; - -/* objnode height_to_maxindex translation */ -static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; - -static void tmem_objnode_tree_init(void) -{ - unsigned int ht, tmp; - - for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { - tmp = ht * OBJNODE_TREE_MAP_SHIFT; - if (tmp >= OBJNODE_TREE_INDEX_BITS) - tmem_objnode_tree_h2max[ht] = ~0UL; - else - tmem_objnode_tree_h2max[ht] = - (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; - } -} - -static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) -{ - struct tmem_objnode *objnode; - - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - objnode = (*tmem_hostops.objnode_alloc)(obj->pool); - if (unlikely(objnode == NULL)) - goto out; - objnode->obj = obj; - SET_SENTINEL(objnode, OBJNODE); - memset(&objnode->slots, 0, sizeof(objnode->slots)); - objnode->slots_in_use = 0; - obj->objnode_count++; -out: - return objnode; -} - -static void tmem_objnode_free(struct tmem_objnode *objnode) -{ - struct tmem_pool *pool; - int i; - - BUG_ON(objnode == NULL); - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) - BUG_ON(objnode->slots[i] != NULL); - ASSERT_SENTINEL(objnode, OBJNODE); - INVERT_SENTINEL(objnode, OBJNODE); - BUG_ON(objnode->obj == NULL); - ASSERT_SENTINEL(objnode->obj, OBJ); - pool = objnode->obj->pool; - BUG_ON(pool == NULL); - ASSERT_SENTINEL(pool, POOL); - objnode->obj->objnode_count--; - objnode->obj = NULL; - (*tmem_hostops.objnode_free)(objnode, pool); -} - -/* - * lookup index in object and return associated pampd (or NULL if not 
found) - */ -static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - unsigned int height, shift; - struct tmem_objnode **slot = NULL; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) - goto out; - if (height == 0 && obj->objnode_tree_root) { - slot = &obj->objnode_tree_root; - goto out; - } - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - slot = &obj->objnode_tree_root; - while (height > 0) { - if (*slot == NULL) - goto out; - slot = (struct tmem_objnode **) - ((*slot)->slots + - ((index >> shift) & OBJNODE_TREE_MAP_MASK)); - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } -out: - return slot != NULL ? (void **)slot : NULL; -} - -static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode **slot; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - return slot != NULL ? *slot : NULL; -} - -static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, - void *new_pampd) -{ - struct tmem_objnode **slot; - void *ret = NULL; - - slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); - if ((slot != NULL) && (*slot != NULL)) { - void *old_pampd = *(void **)slot; - *(void **)slot = new_pampd; - (*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0); - ret = new_pampd; - } - return ret; -} - -static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, - void *pampd) -{ - int ret = 0; - struct tmem_objnode *objnode = NULL, *newnode, *slot; - unsigned int height, shift; - int offset = 0; - - /* if necessary, extend the tree to be higher */ - if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { - height = obj->objnode_tree_height + 1; - if (index > tmem_objnode_tree_h2max[height]) - while (index > tmem_objnode_tree_h2max[height]) - height++; - if (obj->objnode_tree_root == NULL) { - obj->objnode_tree_height = height; - goto insert; - } - do { - newnode = tmem_objnode_alloc(obj); - if (!newnode) { - ret = -ENOMEM; - goto out; - } - newnode->slots[0] = obj->objnode_tree_root; - newnode->slots_in_use = 1; - obj->objnode_tree_root = newnode; - obj->objnode_tree_height++; - } while (height > obj->objnode_tree_height); - } -insert: - slot = obj->objnode_tree_root; - height = obj->objnode_tree_height; - shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; - while (height > 0) { - if (slot == NULL) { - /* add a child objnode. 
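[Editor's note: the objnode tree walk above turns a page index into a path of 6-bit slot offsets, one per tree level, much like lib/radix-tree.c. A standalone illustration of that arithmetic, as ordinary userspace C rather than driver code; the constants mirror OBJNODE_TREE_MAP_SHIFT = 6 from tmem.h and the index value is arbitrary:

#include <stdio.h>
#include <stdint.h>

#define MAP_SHIFT 6                      /* OBJNODE_TREE_MAP_SHIFT */
#define MAP_MASK  ((1u << MAP_SHIFT) - 1)

int main(void)
{
	uint32_t index = 74565;          /* arbitrary page index */
	unsigned height = 3;             /* smallest height whose h2max >= index */
	unsigned shift = (height - 1) * MAP_SHIFT;

	/* print the slot offset chosen at each level of the walk */
	while (height > 0) {
		printf("level %u: slot %u\n", height,
		       (index >> shift) & MAP_MASK);
		shift -= MAP_SHIFT;
		height--;
	}
	return 0;   /* prints 18, 13, 5: 18*4096 + 13*64 + 5 = 74565 */
}

The add path below extends the tree upward the same way: each extra level multiplies the addressable index range by 64.]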
*/ - slot = tmem_objnode_alloc(obj); - if (!slot) { - ret = -ENOMEM; - goto out; - } - if (objnode) { - - objnode->slots[offset] = slot; - objnode->slots_in_use++; - } else - obj->objnode_tree_root = slot; - } - /* go down a level */ - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - objnode = slot; - slot = objnode->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } - BUG_ON(slot != NULL); - if (objnode) { - objnode->slots_in_use++; - objnode->slots[offset] = pampd; - } else - obj->objnode_tree_root = pampd; - obj->pampd_count++; -out: - return ret; -} - -static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) -{ - struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; - struct tmem_objnode_tree_path *pathp = path; - struct tmem_objnode *slot = NULL; - unsigned int height, shift; - int offset; - - BUG_ON(obj == NULL); - ASSERT_SENTINEL(obj, OBJ); - BUG_ON(obj->pool == NULL); - ASSERT_SENTINEL(obj->pool, POOL); - height = obj->objnode_tree_height; - if (index > tmem_objnode_tree_h2max[height]) - goto out; - slot = obj->objnode_tree_root; - if (height == 0 && obj->objnode_tree_root) { - obj->objnode_tree_root = NULL; - goto out; - } - shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; - pathp->objnode = NULL; - do { - if (slot == NULL) - goto out; - pathp++; - offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; - pathp->offset = offset; - pathp->objnode = slot; - slot = slot->slots[offset]; - shift -= OBJNODE_TREE_MAP_SHIFT; - height--; - } while (height > 0); - if (slot == NULL) - goto out; - while (pathp->objnode) { - pathp->objnode->slots[pathp->offset] = NULL; - pathp->objnode->slots_in_use--; - if (pathp->objnode->slots_in_use) { - if (pathp->objnode == obj->objnode_tree_root) { - while (obj->objnode_tree_height > 0 && - obj->objnode_tree_root->slots_in_use == 1 && - obj->objnode_tree_root->slots[0]) { - struct tmem_objnode *to_free = - obj->objnode_tree_root; - - obj->objnode_tree_root = - to_free->slots[0]; - obj->objnode_tree_height--; - to_free->slots[0] = NULL; - to_free->slots_in_use = 0; - tmem_objnode_free(to_free); - } - } - goto out; - } - tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ - pathp--; - } - obj->objnode_tree_height = 0; - obj->objnode_tree_root = NULL; - -out: - if (slot != NULL) - obj->pampd_count--; - BUG_ON(obj->pampd_count < 0); - return slot; -} - -/* recursively walk the objnode_tree destroying pampds and objnodes */ -static void tmem_objnode_node_destroy(struct tmem_obj *obj, - struct tmem_objnode *objnode, - unsigned int ht) -{ - int i; - - if (ht == 0) - return; - for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { - if (objnode->slots[i]) { - if (ht == 1) { - obj->pampd_count--; - (*tmem_pamops.free)(objnode->slots[i], - obj->pool, NULL, 0); - objnode->slots[i] = NULL; - continue; - } - tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); - tmem_objnode_free(objnode->slots[i]); - objnode->slots[i] = NULL; - } - } -} - -static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj) -{ - if (obj->objnode_tree_root == NULL) - return; - if (obj->objnode_tree_height == 0) { - obj->pampd_count--; - (*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0); - } else { - tmem_objnode_node_destroy(obj, obj->objnode_tree_root, - obj->objnode_tree_height); - tmem_objnode_free(obj->objnode_tree_root); - obj->objnode_tree_height = 0; - } - obj->objnode_tree_root = NULL; - (*tmem_pamops.free_obj)(obj->pool, obj); -} - -/* - * Tmem is operated on by a set of well-defined actions: - * "put", "get", 
"flush", "flush_object", "new pool" and "destroy pool". - * (The tmem ABI allows for subpages and exchanges but these operations - * are not included in this implementation.) - * - * These "tmem core" operations are implemented in the following functions. - */ - -/* - * "Put" a page, e.g. copy a page from the kernel into newly allocated - * PAM space (if such space is available). Tmem_put is complicated by - * a corner case: What if a page with matching handle already exists in - * tmem? To guarantee coherency, one of two actions is necessary: Either - * the data for the page must be overwritten, or the page must be - * "flushed" so that the data is not accessible to a subsequent "get". - * Since these "duplicate puts" are relatively rare, this implementation - * always flushes for simplicity. - */ -int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t size, bool raw, bool ephemeral) -{ - struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; - void *pampd = NULL, *pampd_del = NULL; - int ret = -ENOMEM; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = objfound = tmem_obj_find(hb, oidp); - if (obj != NULL) { - pampd = tmem_pampd_lookup_in_obj(objfound, index); - if (pampd != NULL) { - /* if found, is a dup put, flush the old one */ - pampd_del = tmem_pampd_delete_from_obj(obj, index); - BUG_ON(pampd_del != pampd); - (*tmem_pamops.free)(pampd, pool, oidp, index); - if (obj->pampd_count == 0) { - objnew = obj; - objfound = NULL; - } - pampd = NULL; - } - } else { - obj = objnew = (*tmem_hostops.obj_alloc)(pool); - if (unlikely(obj == NULL)) { - ret = -ENOMEM; - goto out; - } - tmem_obj_init(obj, hb, pool, oidp); - } - BUG_ON(obj == NULL); - BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); - pampd = (*tmem_pamops.create)(data, size, raw, ephemeral, - obj->pool, &obj->oid, index); - if (unlikely(pampd == NULL)) - goto free; - ret = tmem_pampd_add_to_obj(obj, index, pampd); - if (unlikely(ret == -ENOMEM)) - /* may have partially built objnode tree ("stump") */ - goto delete_and_free; - goto out; - -delete_and_free: - (void)tmem_pampd_delete_from_obj(obj, index); -free: - if (pampd) - (*tmem_pamops.free)(pampd, pool, NULL, 0); - if (objnew) { - tmem_obj_free(objnew, hb); - (*tmem_hostops.obj_free)(objnew, pool); - } -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Get" a page, e.g. if one can be found, copy the tmem page with the - * matching handle from PAM space to the kernel. By tmem definition, - * when a "get" is successful on an ephemeral page, the page is "flushed", - * and when a "get" is successful on a persistent page, the page is retained - * in tmem. Note that to preserve - * coherency, "get" can never be skipped if tmem contains the data. - * That is, if a get is done with a certain handle and fails, any - * subsequent "get" must also fail (unless of course there is a - * "put" done with the same handle). 
- - */ -int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, - char *data, size_t *size, bool raw, int get_and_free) -{ - struct tmem_obj *obj; - void *pampd; - bool ephemeral = is_ephemeral(pool); - int ret = -1; - struct tmem_hashbucket *hb; - bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); - bool lock_held = false; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - lock_held = true; - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - if (free) - pampd = tmem_pampd_delete_from_obj(obj, index); - else - pampd = tmem_pampd_lookup_in_obj(obj, index); - if (pampd == NULL) - goto out; - if (free) { - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - obj = NULL; - } - } - if (tmem_pamops.is_remote(pampd)) { - lock_held = false; - spin_unlock(&hb->lock); - } - if (free) - ret = (*tmem_pamops.get_data_and_free)( - data, size, raw, pampd, pool, oidp, index); - else - ret = (*tmem_pamops.get_data)( - data, size, raw, pampd, pool, oidp, index); - if (ret < 0) - goto out; - ret = 0; -out: - if (lock_held) - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, "flush" this page from tmem such - * that any subsequent "get" does not succeed (unless, of course, there - * was another "put" with the same handle). - */ -int tmem_flush_page(struct tmem_pool *pool, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_obj *obj; - void *pampd; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - pampd = tmem_pampd_delete_from_obj(obj, index); - if (pampd == NULL) - goto out; - (*tmem_pamops.free)(pampd, pool, oidp, index); - if (obj->pampd_count == 0) { - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - } - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * If a page in tmem matches the handle, replace the page so that any - * subsequent "get" gets the new page. Returns 0 if - * there was a page to replace, else returns -1. - */ -int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, - uint32_t index, void *new_pampd) -{ - struct tmem_obj *obj; - int ret = -1; - struct tmem_hashbucket *hb; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd); - ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages in tmem matching this oid. - */ -int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) -{ - struct tmem_obj *obj; - struct tmem_hashbucket *hb; - int ret = -1; - - hb = &pool->hashbucket[tmem_oid_hash(oidp)]; - spin_lock(&hb->lock); - obj = tmem_obj_find(hb, oidp); - if (obj == NULL) - goto out; - tmem_pampd_destroy_all_in_obj(obj); - tmem_obj_free(obj, hb); - (*tmem_hostops.obj_free)(obj, pool); - ret = 0; - -out: - spin_unlock(&hb->lock); - return ret; -} - -/* - * "Flush" all pages (and tmem_objs) from this tmem_pool and disable - * all subsequent access to this tmem_pool. 
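[Editor's note: the corresponding teardown calls, again only an illustrative sketch of API usage rather than anything this patch adds. A single page is invalidated with tmem_flush_page(), a whole object with tmem_flush_object(), and the pool itself with tmem_destroy_pool(), which flushes whatever is left:

#include "tmem.h"

/* illustrative only: invalidate one page, then one object, then the pool */
static void example_teardown(struct tmem_pool *my_pool, struct tmem_oid *oid,
			     uint32_t index)
{
	(void)tmem_flush_page(my_pool, oid, index);  /* returns -1 if nothing matched */
	(void)tmem_flush_object(my_pool, oid);       /* drops every index under oid   */
	(void)tmem_destroy_pool(my_pool);            /* flushes and unlists the pool  */
}]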
- */ -int tmem_destroy_pool(struct tmem_pool *pool) -{ - int ret = -1; - - if (pool == NULL) - goto out; - tmem_pool_flush(pool, 1); - ret = 0; -out: - return ret; -} - -static LIST_HEAD(tmem_global_pool_list); - -/* - * Create a new tmem_pool with the provided flag and return - * a pool id provided by the tmem host implementation. - */ -void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) -{ - int persistent = flags & TMEM_POOL_PERSIST; - int shared = flags & TMEM_POOL_SHARED; - struct tmem_hashbucket *hb = &pool->hashbucket[0]; - int i; - - for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { - hb->obj_rb_root = RB_ROOT; - spin_lock_init(&hb->lock); - } - INIT_LIST_HEAD(&pool->pool_list); - atomic_set(&pool->obj_count, 0); - SET_SENTINEL(pool, POOL); - list_add_tail(&pool->pool_list, &tmem_global_pool_list); - pool->persistent = persistent; - pool->shared = shared; -} diff --git a/drivers/staging/zcache/tmem.h b/drivers/staging/zcache/tmem.h deleted file mode 100644 index 0d4aa8270..000000000 --- a/drivers/staging/zcache/tmem.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * tmem.h - * - * Transcendent memory - * - * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. - */ - -#ifndef _TMEM_H_ -#define _TMEM_H_ - -#include -#include -#include -#include - -/* - * These are pre-defined by the Xen<->Linux ABI - */ -#define TMEM_PUT_PAGE 4 -#define TMEM_GET_PAGE 5 -#define TMEM_FLUSH_PAGE 6 -#define TMEM_FLUSH_OBJECT 7 -#define TMEM_POOL_PERSIST 1 -#define TMEM_POOL_SHARED 2 -#define TMEM_POOL_PRECOMPRESSED 4 -#define TMEM_POOL_PAGESIZE_SHIFT 4 -#define TMEM_POOL_PAGESIZE_MASK 0xf -#define TMEM_POOL_RESERVED_BITS 0x00ffff00 - -/* - * sentinels have proven very useful for debugging but can be removed - * or disabled before final merge. - */ -#define SENTINELS -#ifdef SENTINELS -#define DECL_SENTINEL uint32_t sentinel; -#define SET_SENTINEL(_x, _y) (_x->sentinel = _y##_SENTINEL) -#define INVERT_SENTINEL(_x, _y) (_x->sentinel = ~_y##_SENTINEL) -#define ASSERT_SENTINEL(_x, _y) WARN_ON(_x->sentinel != _y##_SENTINEL) -#define ASSERT_INVERTED_SENTINEL(_x, _y) WARN_ON(_x->sentinel != ~_y##_SENTINEL) -#else -#define DECL_SENTINEL -#define SET_SENTINEL(_x, _y) do { } while (0) -#define INVERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_SENTINEL(_x, _y) do { } while (0) -#define ASSERT_INVERTED_SENTINEL(_x, _y) do { } while (0) -#endif - -#define ASSERT_SPINLOCK(_l) lockdep_assert_held(_l) - -/* - * A pool is the highest-level data structure managed by tmem and - * usually corresponds to a large independent set of pages such as - * a filesystem. Each pool has an id, and certain attributes and counters. - * It also contains a set of hash buckets, each of which contains an rbtree - * of objects and a lock to manage concurrency within the pool. - */ - -#define TMEM_HASH_BUCKET_BITS 8 -#define TMEM_HASH_BUCKETS (1<persistent) -#define is_ephemeral(_p) (!(_p->persistent)) - -/* - * An object id ("oid") is large: 192-bits (to ensure, for example, files - * in a modern filesystem can be uniquely identified). 
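[Editor's note: by way of example, a cleancache-style caller packs its key into the three 64-bit words of the oid and lets tmem_oid_hash(), defined just below, pick the hash bucket. The field values here are made up purely for illustration:

#include "tmem.h"

/* illustrative only: build a 192-bit oid and find its hash bucket */
static unsigned example_bucket_for(uint64_t ino)
{
	struct tmem_oid oid = {
		.oid = { ino, 0, 0 },   /* e.g. an inode number, rest zero */
	};

	/* returns a value in [0, TMEM_HASH_BUCKETS) */
	return tmem_oid_hash(&oid);
}]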
- */ - -struct tmem_oid { - uint64_t oid[3]; -}; - -static inline void tmem_oid_set_invalid(struct tmem_oid *oidp) -{ - oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL; -} - -static inline bool tmem_oid_valid(struct tmem_oid *oidp) -{ - return oidp->oid[0] != -1UL || oidp->oid[1] != -1UL || - oidp->oid[2] != -1UL; -} - -static inline int tmem_oid_compare(struct tmem_oid *left, - struct tmem_oid *right) -{ - int ret; - - if (left->oid[2] == right->oid[2]) { - if (left->oid[1] == right->oid[1]) { - if (left->oid[0] == right->oid[0]) - ret = 0; - else if (left->oid[0] < right->oid[0]) - ret = -1; - else - return 1; - } else if (left->oid[1] < right->oid[1]) - ret = -1; - else - ret = 1; - } else if (left->oid[2] < right->oid[2]) - ret = -1; - else - ret = 1; - return ret; -} - -static inline unsigned tmem_oid_hash(struct tmem_oid *oidp) -{ - return hash_long(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2], - TMEM_HASH_BUCKET_BITS); -} - -/* - * A tmem_obj contains an identifier (oid), pointers to the parent - * pool and the rb_tree to which it belongs, counters, and an ordered - * set of pampds, structured in a radix-tree-like tree. The intermediate - * nodes of the tree are called tmem_objnodes. - */ - -struct tmem_objnode; - -struct tmem_obj { - struct tmem_oid oid; - struct tmem_pool *pool; - struct rb_node rb_tree_node; - struct tmem_objnode *objnode_tree_root; - unsigned int objnode_tree_height; - unsigned long objnode_count; - long pampd_count; - void *extra; /* for private use by pampd implementation */ - DECL_SENTINEL -}; - -#define OBJNODE_TREE_MAP_SHIFT 6 -#define OBJNODE_TREE_MAP_SIZE (1UL << OBJNODE_TREE_MAP_SHIFT) -#define OBJNODE_TREE_MAP_MASK (OBJNODE_TREE_MAP_SIZE-1) -#define OBJNODE_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define OBJNODE_TREE_MAX_PATH \ - (OBJNODE_TREE_INDEX_BITS/OBJNODE_TREE_MAP_SHIFT + 2) - -struct tmem_objnode { - struct tmem_obj *obj; - DECL_SENTINEL - void *slots[OBJNODE_TREE_MAP_SIZE]; - unsigned int slots_in_use; -}; - -/* pampd abstract datatype methods provided by the PAM implementation */ -struct tmem_pamops { - void *(*create)(char *, size_t, bool, int, - struct tmem_pool *, struct tmem_oid *, uint32_t); - int (*get_data)(char *, size_t *, bool, void *, struct tmem_pool *, - struct tmem_oid *, uint32_t); - int (*get_data_and_free)(char *, size_t *, bool, void *, - struct tmem_pool *, struct tmem_oid *, - uint32_t); - void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t); - void (*free_obj)(struct tmem_pool *, struct tmem_obj *); - bool (*is_remote)(void *); - void (*new_obj)(struct tmem_obj *); - int (*replace_in_obj)(void *, struct tmem_obj *); -}; -extern void tmem_register_pamops(struct tmem_pamops *m); - -/* memory allocation methods provided by the host implementation */ -struct tmem_hostops { - struct tmem_obj *(*obj_alloc)(struct tmem_pool *); - void (*obj_free)(struct tmem_obj *, struct tmem_pool *); - struct tmem_objnode *(*objnode_alloc)(struct tmem_pool *); - void (*objnode_free)(struct tmem_objnode *, struct tmem_pool *); -}; -extern void tmem_register_hostops(struct tmem_hostops *m); - -/* core tmem accessor functions */ -extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t, bool, bool); -extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index, - char *, size_t *, bool, int); -extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index, - void *); -extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *, - uint32_t 
index); -extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *); -extern int tmem_destroy_pool(struct tmem_pool *); -extern void tmem_new_pool(struct tmem_pool *, uint32_t); -#endif /* _TMEM_H */ diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c deleted file mode 100644 index 1812bedab..000000000 --- a/drivers/staging/zcache/zcache-main.c +++ /dev/null @@ -1,2083 +0,0 @@ -/* - * zcache.c - * - * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp. - * Copyright (c) 2010,2011, Nitin Gupta - * - * Zcache provides an in-kernel "host implementation" for transcendent memory - * and, thus indirectly, for cleancache and frontswap. Zcache includes two - * page-accessible memory [1] interfaces, both utilizing the crypto compression - * API: - * 1) "compression buddies" ("zbud") is used for ephemeral pages - * 2) zsmalloc is used for persistent pages. - * Xvmalloc (based on the TLSF allocator) has very low fragmentation - * so maximizes space efficiency, while zbud allows pairs (and potentially, - * in the future, more than a pair of) compressed pages to be closely linked - * so that reclaiming can be done via the kernel's physical-page-oriented - * "shrinker" interface. - * - * [1] For a definition of page-accessible memory (aka PAM), see: - * http://marc.info/?l=linux-mm&m=127811271605009 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tmem.h" - -#include "../zsmalloc/zsmalloc.h" - -#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP)) -#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP" -#endif -#ifdef CONFIG_CLEANCACHE -#include -#endif -#ifdef CONFIG_FRONTSWAP -#include -#endif - -#if 0 -/* this is more aggressive but may cause other problems? 
*/ -#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN) -#else -#define ZCACHE_GFP_MASK \ - (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC) -#endif - -#define MAX_POOLS_PER_CLIENT 16 - -#define MAX_CLIENTS 16 -#define LOCAL_CLIENT ((uint16_t)-1) - -MODULE_LICENSE("GPL"); - -struct zcache_client { - struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT]; - struct zs_pool *zspool; - bool allocated; - atomic_t refcount; -}; - -static struct zcache_client zcache_host; -static struct zcache_client zcache_clients[MAX_CLIENTS]; - -static inline uint16_t get_client_id_from_client(struct zcache_client *cli) -{ - BUG_ON(cli == NULL); - if (cli == &zcache_host) - return LOCAL_CLIENT; - return cli - &zcache_clients[0]; -} - -static inline bool is_local_client(struct zcache_client *cli) -{ - return cli == &zcache_host; -} - -/* crypto API for zcache */ -#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME -static char zcache_comp_name[ZCACHE_COMP_NAME_SZ]; -static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; - -enum comp_op { - ZCACHE_COMPOP_COMPRESS, - ZCACHE_COMPOP_DECOMPRESS -}; - -static inline int zcache_comp_op(enum comp_op op, - const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen) -{ - struct crypto_comp *tfm; - int ret; - - BUG_ON(!zcache_comp_pcpu_tfms); - tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); - BUG_ON(!tfm); - switch (op) { - case ZCACHE_COMPOP_COMPRESS: - ret = crypto_comp_compress(tfm, src, slen, dst, dlen); - break; - case ZCACHE_COMPOP_DECOMPRESS: - ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); - break; - } - put_cpu(); - return ret; -} - -/********** - * Compression buddies ("zbud") provides for packing two (or, possibly - * in the future, more) compressed ephemeral pages into a single "raw" - * (physical) page and tracking them with data structures so that - * the raw pages can be easily reclaimed. - * - * A zbud page ("zbpg") is an aligned page containing a list_head, - * a lock, and two "zbud headers". The remainder of the physical - * page is divided up into aligned 64-byte "chunks" which contain - * the compressed data for zero, one, or two zbuds. Each zbpg - * resides on: (1) an "unused list" if it has no zbuds; (2) a - * "buddied" list if it is fully populated with two zbuds; or - * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks - * the one unbuddied zbud uses. The data inside a zbpg cannot be - * read or written unless the zbpg's lock is held. 
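[Editor's note: to make the chunk bookkeeping below concrete: with 64-byte chunks, a compressed buddy is rounded up to the next chunk boundary, and a raw page holding a single buddy sits on the unbuddied list indexed by that chunk count. A standalone illustration of the rounding, as ordinary userspace C and assuming the usual 4096-byte page:

#include <stdio.h>

#define CHUNK_SHIFT 6
#define CHUNK_SIZE  (1 << CHUNK_SHIFT)   /* 64-byte chunks, as below */

/* same rounding as zbud_size_to_chunks() */
static unsigned size_to_chunks(unsigned size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

int main(void)
{
	/* a page compressed to 700 bytes occupies 11 chunks (704 bytes), so  */
	/* its zbpg is filed on zbud_unbuddied[11] until a second compressed  */
	/* page small enough to fit alongside it is packed into the same page */
	printf("700 bytes -> %u chunks\n", size_to_chunks(700));
	return 0;
}]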
- */ - -#define ZBH_SENTINEL 0x43214321 -#define ZBPG_SENTINEL 0xdeadbeef - -#define ZBUD_MAX_BUDS 2 - -struct zbud_hdr { - uint16_t client_id; - uint16_t pool_id; - struct tmem_oid oid; - uint32_t index; - uint16_t size; /* compressed size in bytes, zero means unused */ - DECL_SENTINEL -}; - -struct zbud_page { - struct list_head bud_list; - spinlock_t lock; - struct zbud_hdr buddy[ZBUD_MAX_BUDS]; - DECL_SENTINEL - /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */ -}; - -#define CHUNK_SHIFT 6 -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define CHUNK_MASK (~(CHUNK_SIZE-1)) -#define NCHUNKS (((PAGE_SIZE - sizeof(struct zbud_page)) & \ - CHUNK_MASK) >> CHUNK_SHIFT) -#define MAX_CHUNK (NCHUNKS-1) - -static struct { - struct list_head list; - unsigned count; -} zbud_unbuddied[NCHUNKS]; -/* list N contains pages with N chunks USED and NCHUNKS-N unused */ -/* element 0 is never used but optimizing that isn't worth it */ -static unsigned long zbud_cumul_chunk_counts[NCHUNKS]; - -struct list_head zbud_buddied_list; -static unsigned long zcache_zbud_buddied_count; - -/* protects the buddied list and all unbuddied lists */ -static DEFINE_SPINLOCK(zbud_budlists_spinlock); - -static LIST_HEAD(zbpg_unused_list); -static unsigned long zcache_zbpg_unused_list_count; - -/* protects the unused page list */ -static DEFINE_SPINLOCK(zbpg_unused_list_spinlock); - -static atomic_t zcache_zbud_curr_raw_pages; -static atomic_t zcache_zbud_curr_zpages; -static unsigned long zcache_zbud_curr_zbytes; -static unsigned long zcache_zbud_cumul_zpages; -static unsigned long zcache_zbud_cumul_zbytes; -static unsigned long zcache_compress_poor; -static unsigned long zcache_mean_compress_poor; - -/* forward references */ -static void *zcache_get_free_page(void); -static void zcache_free_page(void *p); - -/* - * zbud helper functions - */ - -static inline unsigned zbud_max_buddy_size(void) -{ - return MAX_CHUNK << CHUNK_SHIFT; -} - -static inline unsigned zbud_size_to_chunks(unsigned size) -{ - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; -} - -static inline int zbud_budnum(struct zbud_hdr *zh) -{ - unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1); - struct zbud_page *zbpg = NULL; - unsigned budnum = -1U; - int i; - - for (i = 0; i < ZBUD_MAX_BUDS; i++) - if (offset == offsetof(typeof(*zbpg), buddy[i])) { - budnum = i; - break; - } - BUG_ON(budnum == -1U); - return budnum; -} - -static char *zbud_data(struct zbud_hdr *zh, unsigned size) -{ - struct zbud_page *zbpg; - char *p; - unsigned budnum; - - ASSERT_SENTINEL(zh, ZBH); - budnum = zbud_budnum(zh); - BUG_ON(size == 0 || size > zbud_max_buddy_size()); - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - ASSERT_SPINLOCK(&zbpg->lock); - p = (char *)zbpg; - if (budnum == 0) - p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) & - CHUNK_MASK); - else if (budnum == 1) - p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK); - return p; -} - -/* - * zbud raw page management - */ - -static struct zbud_page *zbud_alloc_raw_page(void) -{ - struct zbud_page *zbpg = NULL; - struct zbud_hdr *zh0, *zh1; - bool recycled = 0; - - /* if any pages on the zbpg list, use one */ - spin_lock(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - recycled = 1; - } - spin_unlock(&zbpg_unused_list_spinlock); - if (zbpg == NULL) - /* none on zbpg list, try to get a 
kernel page */ - zbpg = zcache_get_free_page(); - if (likely(zbpg != NULL)) { - INIT_LIST_HEAD(&zbpg->bud_list); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - spin_lock_init(&zbpg->lock); - if (recycled) { - ASSERT_INVERTED_SENTINEL(zbpg, ZBPG); - SET_SENTINEL(zbpg, ZBPG); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - } else { - atomic_inc(&zcache_zbud_curr_raw_pages); - INIT_LIST_HEAD(&zbpg->bud_list); - SET_SENTINEL(zbpg, ZBPG); - zh0->size = 0; zh1->size = 0; - tmem_oid_set_invalid(&zh0->oid); - tmem_oid_set_invalid(&zh1->oid); - } - } - return zbpg; -} - -static void zbud_free_raw_page(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1]; - - ASSERT_SENTINEL(zbpg, ZBPG); - BUG_ON(!list_empty(&zbpg->bud_list)); - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid)); - BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid)); - INVERT_SENTINEL(zbpg, ZBPG); - spin_unlock(&zbpg->lock); - spin_lock(&zbpg_unused_list_spinlock); - list_add(&zbpg->bud_list, &zbpg_unused_list); - zcache_zbpg_unused_list_count++; - spin_unlock(&zbpg_unused_list_spinlock); -} - -/* - * core zbud handling routines - */ - -static unsigned zbud_free(struct zbud_hdr *zh) -{ - unsigned size; - - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(!tmem_oid_valid(&zh->oid)); - size = zh->size; - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - zh->size = 0; - tmem_oid_set_invalid(&zh->oid); - INVERT_SENTINEL(zh, ZBH); - zcache_zbud_curr_zbytes -= size; - atomic_dec(&zcache_zbud_curr_zpages); - return size; -} - -static void zbud_free_and_delist(struct zbud_hdr *zh) -{ - unsigned chunks; - struct zbud_hdr *zh_other; - unsigned budnum = zbud_budnum(zh), size; - struct zbud_page *zbpg = - container_of(zh, struct zbud_page, buddy[budnum]); - - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... see zbud_evict_pages() */ - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - return; - } - size = zbud_free(zh); - ASSERT_SPINLOCK(&zbpg->lock); - zh_other = &zbpg->buddy[(budnum == 0) ? 
1 : 0]; - if (zh_other->size == 0) { /* was unbuddied: unlist and free */ - chunks = zbud_size_to_chunks(size) ; - BUG_ON(list_empty(&zbud_unbuddied[chunks].list)); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[chunks].count--; - spin_unlock(&zbud_budlists_spinlock); - zbud_free_raw_page(zbpg); - } else { /* was buddied: move remaining buddy to unbuddied list */ - chunks = zbud_size_to_chunks(zh_other->size) ; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list); - zbud_unbuddied[chunks].count++; - spin_unlock(&zbud_budlists_spinlock); - spin_unlock(&zbpg->lock); - } -} - -static struct zbud_hdr *zbud_create(uint16_t client_id, uint16_t pool_id, - struct tmem_oid *oid, - uint32_t index, struct page *page, - void *cdata, unsigned size) -{ - struct zbud_hdr *zh0, *zh1, *zh = NULL; - struct zbud_page *zbpg = NULL, *ztmp; - unsigned nchunks; - char *to; - int i, found_good_buddy = 0; - - nchunks = zbud_size_to_chunks(size) ; - for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) { - spin_lock(&zbud_budlists_spinlock); - if (!list_empty(&zbud_unbuddied[i].list)) { - list_for_each_entry_safe(zbpg, ztmp, - &zbud_unbuddied[i].list, bud_list) { - if (spin_trylock(&zbpg->lock)) { - found_good_buddy = i; - goto found_unbuddied; - } - } - } - spin_unlock(&zbud_budlists_spinlock); - } - /* didn't find a good buddy, try allocating a new page */ - zbpg = zbud_alloc_raw_page(); - if (unlikely(zbpg == NULL)) - goto out; - /* ok, have a page, now compress the data before taking locks */ - spin_lock(&zbud_budlists_spinlock); - spin_lock(&zbpg->lock); - list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list); - zbud_unbuddied[nchunks].count++; - zh = &zbpg->buddy[0]; - goto init_zh; - -found_unbuddied: - ASSERT_SPINLOCK(&zbpg->lock); - zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1]; - BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0))); - if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */ - ASSERT_SENTINEL(zh0, ZBH); - zh = zh1; - } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */ - ASSERT_SENTINEL(zh1, ZBH); - zh = zh0; - } else - BUG(); - list_del_init(&zbpg->bud_list); - zbud_unbuddied[found_good_buddy].count--; - list_add_tail(&zbpg->bud_list, &zbud_buddied_list); - zcache_zbud_buddied_count++; - -init_zh: - SET_SENTINEL(zh, ZBH); - zh->size = size; - zh->index = index; - zh->oid = *oid; - zh->pool_id = pool_id; - zh->client_id = client_id; - to = zbud_data(zh, size); - memcpy(to, cdata, size); - spin_unlock(&zbpg->lock); - spin_unlock(&zbud_budlists_spinlock); - - zbud_cumul_chunk_counts[nchunks]++; - atomic_inc(&zcache_zbud_curr_zpages); - zcache_zbud_cumul_zpages++; - zcache_zbud_curr_zbytes += size; - zcache_zbud_cumul_zbytes += size; -out: - return zh; -} - -static int zbud_decompress(struct page *page, struct zbud_hdr *zh) -{ - struct zbud_page *zbpg; - unsigned budnum = zbud_budnum(zh); - unsigned int out_len = PAGE_SIZE; - char *to_va, *from_va; - unsigned size; - int ret = 0; - - zbpg = container_of(zh, struct zbud_page, buddy[budnum]); - spin_lock(&zbpg->lock); - if (list_empty(&zbpg->bud_list)) { - /* ignore zombie page... 
see zbud_evict_pages() */ - ret = -EINVAL; - goto out; - } - ASSERT_SENTINEL(zh, ZBH); - BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size()); - to_va = kmap_atomic(page); - size = zh->size; - from_va = zbud_data(zh, size); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size, - to_va, &out_len); - BUG_ON(ret); - BUG_ON(out_len != PAGE_SIZE); - kunmap_atomic(to_va); -out: - spin_unlock(&zbpg->lock); - return ret; -} - -/* - * The following routines handle shrinking of ephemeral pages by evicting - * pages "least valuable" first. - */ - -static unsigned long zcache_evicted_raw_pages; -static unsigned long zcache_evicted_buddied_pages; -static unsigned long zcache_evicted_unbuddied_pages; - -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, - uint16_t poolid); -static void zcache_put_pool(struct tmem_pool *pool); - -/* - * Flush and free all zbuds in a zbpg, then free the pageframe - */ -static void zbud_evict_zbpg(struct zbud_page *zbpg) -{ - struct zbud_hdr *zh; - int i, j; - uint32_t pool_id[ZBUD_MAX_BUDS], client_id[ZBUD_MAX_BUDS]; - uint32_t index[ZBUD_MAX_BUDS]; - struct tmem_oid oid[ZBUD_MAX_BUDS]; - struct tmem_pool *pool; - - ASSERT_SPINLOCK(&zbpg->lock); - BUG_ON(!list_empty(&zbpg->bud_list)); - for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) { - zh = &zbpg->buddy[i]; - if (zh->size) { - client_id[j] = zh->client_id; - pool_id[j] = zh->pool_id; - oid[j] = zh->oid; - index[j] = zh->index; - j++; - zbud_free(zh); - } - } - spin_unlock(&zbpg->lock); - for (i = 0; i < j; i++) { - pool = zcache_get_pool_by_id(client_id[i], pool_id[i]); - if (pool != NULL) { - tmem_flush_page(pool, &oid[i], index[i]); - zcache_put_pool(pool); - } - } - ASSERT_SENTINEL(zbpg, ZBPG); - spin_lock(&zbpg->lock); - zbud_free_raw_page(zbpg); -} - -/* - * Free nr pages. This code is funky because we want to hold the locks - * protecting various lists for as short a time as possible, and in some - * circumstances the list may change asynchronously when the list lock is - * not held. In some cases we also trylock not only to avoid waiting on a - * page in use by another cpu, but also to avoid potential deadlock due to - * lock inversion. 
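[Editor's note: the shape of that locking dance, distilled into a schematic sketch; "elem" and its fields are placeholders, not driver symbols. The list lock is taken with bottom halves disabled and held only long enough to trylock and unlink one element; the expensive per-element work happens after the list lock is dropped, which is what avoids the inversion described above:

#include <linux/list.h>
#include <linux/spinlock.h>

struct elem {                            /* placeholder element type */
	struct list_head list;
	spinlock_t lock;
};

static void evict_one(struct list_head *head, spinlock_t *list_lock)
{
	struct elem *e;

	spin_lock_bh(list_lock);                 /* short critical section */
	list_for_each_entry(e, head, list) {
		if (!spin_trylock(&e->lock))     /* busy? skip, don't wait */
			continue;
		list_del_init(&e->list);         /* unlink under both locks */
		spin_unlock(list_lock);          /* then drop the list lock */
		/* expensive work goes here, e->lock still held */
		spin_unlock(&e->lock);
		local_bh_enable();               /* pairs with spin_lock_bh */
		return;
	}
	spin_unlock_bh(list_lock);
}]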
- */ -static void zbud_evict_pages(int nr) -{ - struct zbud_page *zbpg; - int i; - - /* first try freeing any pages on unused list */ -retry_unused_list: - spin_lock_bh(&zbpg_unused_list_spinlock); - if (!list_empty(&zbpg_unused_list)) { - /* can't walk list here, since it may change when unlocked */ - zbpg = list_first_entry(&zbpg_unused_list, - struct zbud_page, bud_list); - list_del_init(&zbpg->bud_list); - zcache_zbpg_unused_list_count--; - atomic_dec(&zcache_zbud_curr_raw_pages); - spin_unlock_bh(&zbpg_unused_list_spinlock); - zcache_free_page(zbpg); - zcache_evicted_raw_pages++; - if (--nr <= 0) - goto out; - goto retry_unused_list; - } - spin_unlock_bh(&zbpg_unused_list_spinlock); - - /* now try freeing unbuddied pages, starting with least space avail */ - for (i = 0; i < MAX_CHUNK; i++) { -retry_unbud_list_i: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_unbuddied[i].list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - continue; - } - list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zbud_unbuddied[i].count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_unbuddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_unbud_list_i; - } - spin_unlock_bh(&zbud_budlists_spinlock); - } - - /* as a last resort, free buddied pages */ -retry_bud_list: - spin_lock_bh(&zbud_budlists_spinlock); - if (list_empty(&zbud_buddied_list)) { - spin_unlock_bh(&zbud_budlists_spinlock); - goto out; - } - list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) { - if (unlikely(!spin_trylock(&zbpg->lock))) - continue; - list_del_init(&zbpg->bud_list); - zcache_zbud_buddied_count--; - spin_unlock(&zbud_budlists_spinlock); - zcache_evicted_buddied_pages++; - /* want budlists unlocked when doing zbpg eviction */ - zbud_evict_zbpg(zbpg); - local_bh_enable(); - if (--nr <= 0) - goto out; - goto retry_bud_list; - } - spin_unlock_bh(&zbud_budlists_spinlock); -out: - return; -} - -static void zbud_init(void) -{ - int i; - - INIT_LIST_HEAD(&zbud_buddied_list); - zcache_zbud_buddied_count = 0; - for (i = 0; i < NCHUNKS; i++) { - INIT_LIST_HEAD(&zbud_unbuddied[i].list); - zbud_unbuddied[i].count = 0; - } -} - -#ifdef CONFIG_SYSFS -/* - * These sysfs routines show a nice distribution of how many zbpg's are - * currently (and have ever been placed) in each unbuddied list. It's fun - * to watch but can probably go away before final merge. 
- */ -static int zbud_show_unbuddied_list_counts(char *buf) -{ - int i; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) - p += sprintf(p, "%u ", zbud_unbuddied[i].count); - return p - buf; -} - -static int zbud_show_cumul_chunk_counts(char *buf) -{ - unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0; - unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0; - unsigned long total_chunks_lte_42 = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]); - chunks += zbud_cumul_chunk_counts[i]; - total_chunks += zbud_cumul_chunk_counts[i]; - sum_total_chunks += i * zbud_cumul_chunk_counts[i]; - if (i == 21) - total_chunks_lte_21 = total_chunks; - if (i == 32) - total_chunks_lte_32 = total_chunks; - if (i == 42) - total_chunks_lte_42 = total_chunks; - } - p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n", - total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42, - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} -#endif - -/********** - * This "zv" PAM implementation combines the slab-based zsmalloc - * with the crypto compression API to maximize the amount of data that can - * be packed into a physical page. - * - * Zv represents a PAM page with the index and object (plus a "size" value - * necessary for decompression) immediately preceding the compressed data. - */ - -#define ZVH_SENTINEL 0x43214321 - -struct zv_hdr { - uint32_t pool_id; - struct tmem_oid oid; - uint32_t index; - size_t size; - DECL_SENTINEL -}; - -/* rudimentary policy limits */ -/* total number of persistent pages may not exceed this percentage */ -static unsigned int zv_page_count_policy_percent = 75; -/* - * byte count defining poor compression; pages with greater zsize will be - * rejected - */ -static unsigned int zv_max_zsize = (PAGE_SIZE / 8) * 7; -/* - * byte count defining poor *mean* compression; pages with greater zsize - * will be rejected until sufficient better-compressed pages are accepted - * driving the mean below this threshold - */ -static unsigned int zv_max_mean_zsize = (PAGE_SIZE / 8) * 5; - -static atomic_t zv_curr_dist_counts[NCHUNKS]; -static atomic_t zv_cumul_dist_counts[NCHUNKS]; - -static struct zv_hdr *zv_create(struct zs_pool *pool, uint32_t pool_id, - struct tmem_oid *oid, uint32_t index, - void *cdata, unsigned clen) -{ - struct zv_hdr *zv; - u32 size = clen + sizeof(struct zv_hdr); - int chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - void *handle = NULL; - - BUG_ON(!irqs_disabled()); - BUG_ON(chunks >= NCHUNKS); - handle = zs_malloc(pool, size); - if (!handle) - goto out; - atomic_inc(&zv_curr_dist_counts[chunks]); - atomic_inc(&zv_cumul_dist_counts[chunks]); - zv = zs_map_object(pool, handle); - zv->index = index; - zv->oid = *oid; - zv->pool_id = pool_id; - zv->size = clen; - SET_SENTINEL(zv, ZVH); - memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen); - zs_unmap_object(pool, handle); -out: - return handle; -} - -static void zv_free(struct zs_pool *pool, void *handle) -{ - unsigned long flags; - struct zv_hdr *zv; - uint16_t size; - int chunks; - - zv = zs_map_object(pool, handle); - ASSERT_SENTINEL(zv, ZVH); - size = zv->size + sizeof(struct zv_hdr); - INVERT_SENTINEL(zv, ZVH); - zs_unmap_object(pool, handle); - - chunks = (size + (CHUNK_SIZE - 1)) >> CHUNK_SHIFT; - BUG_ON(chunks >= NCHUNKS); - atomic_dec(&zv_curr_dist_counts[chunks]); - - local_irq_save(flags); - zs_free(pool, handle); - local_irq_restore(flags); -} - -static void zv_decompress(struct page *page, 
void *handle) -{ - unsigned int clen = PAGE_SIZE; - char *to_va; - int ret; - struct zv_hdr *zv; - - zv = zs_map_object(zcache_host.zspool, handle); - BUG_ON(zv->size == 0); - ASSERT_SENTINEL(zv, ZVH); - to_va = kmap_atomic(page); - ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, (char *)zv + sizeof(*zv), - zv->size, to_va, &clen); - kunmap_atomic(to_va); - zs_unmap_object(zcache_host.zspool, handle); - BUG_ON(ret); - BUG_ON(clen != PAGE_SIZE); -} - -#ifdef CONFIG_SYSFS -/* - * show a distribution of compression stats for zv pages. - */ - -static int zv_curr_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_curr_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -static int zv_cumul_dist_counts_show(char *buf) -{ - unsigned long i, n, chunks = 0, sum_total_chunks = 0; - char *p = buf; - - for (i = 0; i < NCHUNKS; i++) { - n = atomic_read(&zv_cumul_dist_counts[i]); - p += sprintf(p, "%lu ", n); - chunks += n; - sum_total_chunks += i * n; - } - p += sprintf(p, "mean:%lu\n", - chunks == 0 ? 0 : sum_total_chunks / chunks); - return p - buf; -} - -/* - * setting zv_max_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected. We don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_zsize); -} - -static ssize_t zv_max_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_zsize = val; - return count; -} - -/* - * setting zv_max_mean_zsize via sysfs causes all persistent (e.g. swap) - * pages that don't compress to less than this value (including metadata - * overhead) to be rejected UNLESS the mean compression is also smaller - * than this value. In other words, we are load-balancing-by-zsize the - * accepted pages. Again, we don't allow the value to get too close - * to PAGE_SIZE. - */ -static ssize_t zv_max_mean_zsize_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_max_mean_zsize); -} - -static ssize_t zv_max_mean_zsize_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > (PAGE_SIZE / 8) * 7)) - return -EINVAL; - zv_max_mean_zsize = val; - return count; -} - -/* - * setting zv_page_count_policy_percent via sysfs sets an upper bound of - * persistent (e.g. swap) pages that will be retained according to: - * (zv_page_count_policy_percent * totalram_pages) / 100) - * when that limit is reached, further puts will be rejected (until - * some pages have been flushed). Note that, due to compression, - * this number may exceed 100; it defaults to 75 and we set an - * arbitary limit of 150. A poor choice will almost certainly result - * in OOM's, so this value should only be changed prudently. 
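[Editor's note: putting numbers on those three knobs, as a back-of-the-envelope userspace illustration assuming 4096-byte pages and a 2 GiB machine; the real checks live in zcache_pampd_create() further down:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long totalram_pages = 2UL * 1024 * 1024 * 1024 / page_size;

	/* defaults from above */
	unsigned int percent        = 75;                   /* zv_page_count_policy_percent */
	unsigned int max_zsize      = (page_size / 8) * 7;  /* 3584: reject single pages above this */
	unsigned int max_mean_zsize = (page_size / 8) * 5;  /* 2560: reject while the mean is above this */

	printf("persistent page cap: %lu pages\n",
	       percent * totalram_pages / 100);             /* 393216 cached swap pages at most */
	printf("per-page limit: %u, mean limit: %u\n",
	       max_zsize, max_mean_zsize);
	return 0;
}]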
- */ -static ssize_t zv_page_count_policy_percent_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", zv_page_count_policy_percent); -} - -static ssize_t zv_page_count_policy_percent_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = kstrtoul(buf, 10, &val); - if (err || (val == 0) || (val > 150)) - return -EINVAL; - zv_page_count_policy_percent = val; - return count; -} - -static struct kobj_attribute zcache_zv_max_zsize_attr = { - .attr = { .name = "zv_max_zsize", .mode = 0644 }, - .show = zv_max_zsize_show, - .store = zv_max_zsize_store, -}; - -static struct kobj_attribute zcache_zv_max_mean_zsize_attr = { - .attr = { .name = "zv_max_mean_zsize", .mode = 0644 }, - .show = zv_max_mean_zsize_show, - .store = zv_max_mean_zsize_store, -}; - -static struct kobj_attribute zcache_zv_page_count_policy_percent_attr = { - .attr = { .name = "zv_page_count_policy_percent", - .mode = 0644 }, - .show = zv_page_count_policy_percent_show, - .store = zv_page_count_policy_percent_store, -}; -#endif - -/* - * zcache core code starts here - */ - -/* useful stats not collected by cleancache or frontswap */ -static unsigned long zcache_flush_total; -static unsigned long zcache_flush_found; -static unsigned long zcache_flobj_total; -static unsigned long zcache_flobj_found; -static unsigned long zcache_failed_eph_puts; -static unsigned long zcache_failed_pers_puts; - -/* - * Tmem operations assume the poolid implies the invoking client. - * Zcache only has one client (the kernel itself): LOCAL_CLIENT. - * RAMster has each client numbered by cluster node, and a KVM version - * of zcache would have one client per guest and each client might - * have a poolid==N. 
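[Editor's note: a sketch of how a caller resolves that (client id, pool id) pair into a pool. This is a hypothetical helper mirroring what the cleancache/frontswap shims near the end of this file do; every successful lookup must be balanced by zcache_put_pool(), which drops the refcount(s) the lookup took:

#include "tmem.h"

/* illustrative only: look up pool <poolid> of the kernel's own client */
static bool example_pool_has_objects(uint16_t poolid)
{
	struct tmem_pool *pool;
	bool ret;

	pool = zcache_get_pool_by_id(LOCAL_CLIENT, poolid);
	if (pool == NULL)
		return false;            /* unknown client or pool */
	ret = atomic_read(&pool->obj_count) > 0;
	zcache_put_pool(pool);           /* balance the lookup's refcount(s) */
	return ret;
}]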
- */ -static struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else { - if (cli_id >= MAX_CLIENTS) - goto out; - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - } - if (poolid < MAX_POOLS_PER_CLIENT) { - pool = cli->tmem_pools[poolid]; - if (pool != NULL) - atomic_inc(&pool->refcount); - } -out: - return pool; -} - -static void zcache_put_pool(struct tmem_pool *pool) -{ - struct zcache_client *cli = NULL; - - if (pool == NULL) - BUG(); - cli = pool->client; - atomic_dec(&pool->refcount); - atomic_dec(&cli->refcount); -} - -int zcache_new_client(uint16_t cli_id) -{ - struct zcache_client *cli = NULL; - int ret = -1; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - if (cli->allocated) - goto out; - cli->allocated = 1; -#ifdef CONFIG_FRONTSWAP - cli->zspool = zs_create_pool("zcache", ZCACHE_GFP_MASK); - if (cli->zspool == NULL) - goto out; -#endif - ret = 0; -out: - return ret; -} - -/* counters for debugging */ -static unsigned long zcache_failed_get_free_pages; -static unsigned long zcache_failed_alloc; -static unsigned long zcache_put_to_flush; - -/* - * for now, used named slabs so can easily track usage; later can - * either just use kmalloc, or perhaps add a slab-like allocator - * to more carefully manage total memory utilization - */ -static struct kmem_cache *zcache_objnode_cache; -static struct kmem_cache *zcache_obj_cache; -static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_obj_count_max; -static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_objnode_count_max; - -/* - * to avoid memory allocation recursion (e.g. 
due to direct reclaim), we - * preload all necessary data structures so the hostops callbacks never - * actually do a malloc - */ -struct zcache_preload { - void *page; - struct tmem_obj *obj; - int nr; - struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH]; -}; -static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, }; - -static int zcache_do_preload(struct tmem_pool *pool) -{ - struct zcache_preload *kp; - struct tmem_objnode *objnode; - struct tmem_obj *obj; - void *page; - int ret = -ENOMEM; - - if (unlikely(zcache_objnode_cache == NULL)) - goto out; - if (unlikely(zcache_obj_cache == NULL)) - goto out; - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - while (kp->nr < ARRAY_SIZE(kp->objnodes)) { - preempt_enable_no_resched(); - objnode = kmem_cache_alloc(zcache_objnode_cache, - ZCACHE_GFP_MASK); - if (unlikely(objnode == NULL)) { - zcache_failed_alloc++; - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr < ARRAY_SIZE(kp->objnodes)) - kp->objnodes[kp->nr++] = objnode; - else - kmem_cache_free(zcache_objnode_cache, objnode); - } - preempt_enable_no_resched(); - obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK); - if (unlikely(obj == NULL)) { - zcache_failed_alloc++; - goto out; - } - page = (void *)__get_free_page(ZCACHE_GFP_MASK); - if (unlikely(page == NULL)) { - zcache_failed_get_free_pages++; - kmem_cache_free(zcache_obj_cache, obj); - goto out; - } - preempt_disable(); - kp = &__get_cpu_var(zcache_preloads); - if (kp->obj == NULL) - kp->obj = obj; - else - kmem_cache_free(zcache_obj_cache, obj); - if (kp->page == NULL) - kp->page = page; - else - free_page((unsigned long)page); - ret = 0; -out: - return ret; -} - -static void *zcache_get_free_page(void) -{ - struct zcache_preload *kp; - void *page; - - kp = &__get_cpu_var(zcache_preloads); - page = kp->page; - BUG_ON(page == NULL); - kp->page = NULL; - return page; -} - -static void zcache_free_page(void *p) -{ - free_page((unsigned long)p); -} - -/* - * zcache implementation for tmem host ops - */ - -static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool) -{ - struct tmem_objnode *objnode = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - if (kp->nr <= 0) - goto out; - objnode = kp->objnodes[kp->nr - 1]; - BUG_ON(objnode == NULL); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - count = atomic_inc_return(&zcache_curr_objnode_count); - if (count > zcache_curr_objnode_count_max) - zcache_curr_objnode_count_max = count; -out: - return objnode; -} - -static void zcache_objnode_free(struct tmem_objnode *objnode, - struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_objnode_count); - BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0); - kmem_cache_free(zcache_objnode_cache, objnode); -} - -static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool) -{ - struct tmem_obj *obj = NULL; - unsigned long count; - struct zcache_preload *kp; - - kp = &__get_cpu_var(zcache_preloads); - obj = kp->obj; - BUG_ON(obj == NULL); - kp->obj = NULL; - count = atomic_inc_return(&zcache_curr_obj_count); - if (count > zcache_curr_obj_count_max) - zcache_curr_obj_count_max = count; - return obj; -} - -static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool) -{ - atomic_dec(&zcache_curr_obj_count); - BUG_ON(atomic_read(&zcache_curr_obj_count) < 0); - kmem_cache_free(zcache_obj_cache, obj); -} - -static struct tmem_hostops zcache_hostops = { - .obj_alloc = zcache_obj_alloc, - .obj_free = 
zcache_obj_free, - .objnode_alloc = zcache_objnode_alloc, - .objnode_free = zcache_objnode_free, -}; - -/* - * zcache implementations for PAM page descriptor ops - */ - -static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_eph_pampd_count_max; -static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0); -static unsigned long zcache_curr_pers_pampd_count_max; - -/* forward reference */ -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len); - -static void *zcache_pampd_create(char *data, size_t size, bool raw, int eph, - struct tmem_pool *pool, struct tmem_oid *oid, - uint32_t index) -{ - void *pampd = NULL, *cdata; - unsigned clen; - int ret; - unsigned long count; - struct page *page = (struct page *)(data); - struct zcache_client *cli = pool->client; - uint16_t client_id = get_client_id_from_client(cli); - unsigned long zv_mean_zsize; - unsigned long curr_pers_pampd_count; - u64 total_zsize; - - if (eph) { - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - if (clen == 0 || clen > zbud_max_buddy_size()) { - zcache_compress_poor++; - goto out; - } - pampd = (void *)zbud_create(client_id, pool->pool_id, oid, - index, page, cdata, clen); - if (pampd != NULL) { - count = atomic_inc_return(&zcache_curr_eph_pampd_count); - if (count > zcache_curr_eph_pampd_count_max) - zcache_curr_eph_pampd_count_max = count; - } - } else { - curr_pers_pampd_count = - atomic_read(&zcache_curr_pers_pampd_count); - if (curr_pers_pampd_count > - (zv_page_count_policy_percent * totalram_pages) / 100) - goto out; - ret = zcache_compress(page, &cdata, &clen); - if (ret == 0) - goto out; - /* reject if compression is too poor */ - if (clen > zv_max_zsize) { - zcache_compress_poor++; - goto out; - } - /* reject if mean compression is too poor */ - if ((clen > zv_max_mean_zsize) && (curr_pers_pampd_count > 0)) { - total_zsize = zs_get_total_size_bytes(cli->zspool); - zv_mean_zsize = div_u64(total_zsize, - curr_pers_pampd_count); - if (zv_mean_zsize > zv_max_mean_zsize) { - zcache_mean_compress_poor++; - goto out; - } - } - pampd = (void *)zv_create(cli->zspool, pool->pool_id, - oid, index, cdata, clen); - if (pampd == NULL) - goto out; - count = atomic_inc_return(&zcache_curr_pers_pampd_count); - if (count > zcache_curr_pers_pampd_count_max) - zcache_curr_pers_pampd_count_max = count; - } -out: - return pampd; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - int ret = 0; - - BUG_ON(is_ephemeral(pool)); - zv_decompress((struct page *)(data), pampd); - return ret; -} - -/* - * fill the pageframe corresponding to the struct page with the data - * from the passed pampd - */ -static int zcache_pampd_get_data_and_free(char *data, size_t *bufsize, bool raw, - void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - BUG_ON(!is_ephemeral(pool)); - if (zbud_decompress((struct page *)(data), pampd) < 0) - return -EINVAL; - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - return 0; -} - -/* - * free the pampd and remove it from any zcache lists - * pampd must no longer be pointed to from any tmem data structures! 
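[Editor's note: for context on how these per-page descriptors are wired up: tmem itself never allocates or frees a pampd directly, it only calls back through the two ops tables. A minimal sketch of the registration step; the real call sites sit in this file's init path, outside the part of the diff quoted here, and zcache_pamops is the table defined just below:

#include "tmem.h"

/* illustrative only: hand tmem the host and PAM callback tables */
static void example_register_ops(void)
{
	tmem_register_hostops(&zcache_hostops);   /* obj/objnode allocation  */
	tmem_register_pamops(&zcache_pamops);     /* pampd create/get/free   */
}]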
- */ -static void zcache_pampd_free(void *pampd, struct tmem_pool *pool, - struct tmem_oid *oid, uint32_t index) -{ - struct zcache_client *cli = pool->client; - - if (is_ephemeral(pool)) { - zbud_free_and_delist((struct zbud_hdr *)pampd); - atomic_dec(&zcache_curr_eph_pampd_count); - BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0); - } else { - zv_free(cli->zspool, pampd); - atomic_dec(&zcache_curr_pers_pampd_count); - BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0); - } -} - -static void zcache_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj) -{ -} - -static void zcache_pampd_new_obj(struct tmem_obj *obj) -{ -} - -static int zcache_pampd_replace_in_obj(void *pampd, struct tmem_obj *obj) -{ - return -1; -} - -static bool zcache_pampd_is_remote(void *pampd) -{ - return 0; -} - -static struct tmem_pamops zcache_pamops = { - .create = zcache_pampd_create, - .get_data = zcache_pampd_get_data, - .get_data_and_free = zcache_pampd_get_data_and_free, - .free = zcache_pampd_free, - .free_obj = zcache_pampd_free_obj, - .new_obj = zcache_pampd_new_obj, - .replace_in_obj = zcache_pampd_replace_in_obj, - .is_remote = zcache_pampd_is_remote, -}; - -/* - * zcache compression/decompression and related per-cpu stuff - */ - -static DEFINE_PER_CPU(unsigned char *, zcache_dstmem); -#define ZCACHE_DSTMEM_ORDER 1 - -static int zcache_compress(struct page *from, void **out_va, unsigned *out_len) -{ - int ret = 0; - unsigned char *dmem = __get_cpu_var(zcache_dstmem); - char *from_va; - - BUG_ON(!irqs_disabled()); - if (unlikely(dmem == NULL)) - goto out; /* no buffer or no compressor so can't compress */ - *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER; - from_va = kmap_atomic(from); - mb(); - ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem, - out_len); - BUG_ON(ret); - *out_va = dmem; - kunmap_atomic(from_va); - ret = 1; -out: - return ret; -} - -static int zcache_comp_cpu_up(int cpu) -{ - struct crypto_comp *tfm; - - tfm = crypto_alloc_comp(zcache_comp_name, 0, 0); - if (IS_ERR(tfm)) - return NOTIFY_BAD; - *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; - return NOTIFY_OK; -} - -static void zcache_comp_cpu_down(int cpu) -{ - struct crypto_comp *tfm; - - tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); - crypto_free_comp(tfm); - *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; -} - -static int zcache_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) -{ - int ret, cpu = (long)pcpu; - struct zcache_preload *kp; - - switch (action) { - case CPU_UP_PREPARE: - ret = zcache_comp_cpu_up(cpu); - if (ret != NOTIFY_OK) { - pr_err("zcache: can't allocate compressor transform\n"); - return ret; - } - per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages( - GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER); - break; - case CPU_DEAD: - case CPU_UP_CANCELED: - zcache_comp_cpu_down(cpu); - free_pages((unsigned long)per_cpu(zcache_dstmem, cpu), - ZCACHE_DSTMEM_ORDER); - per_cpu(zcache_dstmem, cpu) = NULL; - kp = &per_cpu(zcache_preloads, cpu); - while (kp->nr) { - kmem_cache_free(zcache_objnode_cache, - kp->objnodes[kp->nr - 1]); - kp->objnodes[kp->nr - 1] = NULL; - kp->nr--; - } - if (kp->obj) { - kmem_cache_free(zcache_obj_cache, kp->obj); - kp->obj = NULL; - } - if (kp->page) { - free_page((unsigned long)kp->page); - kp->page = NULL; - } - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block zcache_cpu_notifier_block = { - .notifier_call = zcache_cpu_notifier -}; - -#ifdef CONFIG_SYSFS -#define ZCACHE_SYSFS_RO(_name) \ - 
static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%lu\n", zcache_##_name); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_ATOMIC(_name) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \ - static ssize_t zcache_##_name##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, char *buf) \ - { \ - return _func(buf); \ - } \ - static struct kobj_attribute zcache_##_name##_attr = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = zcache_##_name##_show, \ - } - -ZCACHE_SYSFS_RO(curr_obj_count_max); -ZCACHE_SYSFS_RO(curr_objnode_count_max); -ZCACHE_SYSFS_RO(flush_total); -ZCACHE_SYSFS_RO(flush_found); -ZCACHE_SYSFS_RO(flobj_total); -ZCACHE_SYSFS_RO(flobj_found); -ZCACHE_SYSFS_RO(failed_eph_puts); -ZCACHE_SYSFS_RO(failed_pers_puts); -ZCACHE_SYSFS_RO(zbud_curr_zbytes); -ZCACHE_SYSFS_RO(zbud_cumul_zpages); -ZCACHE_SYSFS_RO(zbud_cumul_zbytes); -ZCACHE_SYSFS_RO(zbud_buddied_count); -ZCACHE_SYSFS_RO(zbpg_unused_list_count); -ZCACHE_SYSFS_RO(evicted_raw_pages); -ZCACHE_SYSFS_RO(evicted_unbuddied_pages); -ZCACHE_SYSFS_RO(evicted_buddied_pages); -ZCACHE_SYSFS_RO(failed_get_free_pages); -ZCACHE_SYSFS_RO(failed_alloc); -ZCACHE_SYSFS_RO(put_to_flush); -ZCACHE_SYSFS_RO(compress_poor); -ZCACHE_SYSFS_RO(mean_compress_poor); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages); -ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages); -ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count); -ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count); -ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts, - zbud_show_unbuddied_list_counts); -ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts, - zbud_show_cumul_chunk_counts); -ZCACHE_SYSFS_RO_CUSTOM(zv_curr_dist_counts, - zv_curr_dist_counts_show); -ZCACHE_SYSFS_RO_CUSTOM(zv_cumul_dist_counts, - zv_cumul_dist_counts_show); - -static struct attribute *zcache_attrs[] = { - &zcache_curr_obj_count_attr.attr, - &zcache_curr_obj_count_max_attr.attr, - &zcache_curr_objnode_count_attr.attr, - &zcache_curr_objnode_count_max_attr.attr, - &zcache_flush_total_attr.attr, - &zcache_flobj_total_attr.attr, - &zcache_flush_found_attr.attr, - &zcache_flobj_found_attr.attr, - &zcache_failed_eph_puts_attr.attr, - &zcache_failed_pers_puts_attr.attr, - &zcache_compress_poor_attr.attr, - &zcache_mean_compress_poor_attr.attr, - &zcache_zbud_curr_raw_pages_attr.attr, - &zcache_zbud_curr_zpages_attr.attr, - &zcache_zbud_curr_zbytes_attr.attr, - &zcache_zbud_cumul_zpages_attr.attr, - &zcache_zbud_cumul_zbytes_attr.attr, - &zcache_zbud_buddied_count_attr.attr, - &zcache_zbpg_unused_list_count_attr.attr, - &zcache_evicted_raw_pages_attr.attr, - &zcache_evicted_unbuddied_pages_attr.attr, - &zcache_evicted_buddied_pages_attr.attr, - &zcache_failed_get_free_pages_attr.attr, - &zcache_failed_alloc_attr.attr, - &zcache_put_to_flush_attr.attr, - &zcache_zbud_unbuddied_list_counts_attr.attr, - &zcache_zbud_cumul_chunk_counts_attr.attr, - &zcache_zv_curr_dist_counts_attr.attr, - &zcache_zv_cumul_dist_counts_attr.attr, - &zcache_zv_max_zsize_attr.attr, - 
&zcache_zv_max_mean_zsize_attr.attr, - &zcache_zv_page_count_policy_percent_attr.attr, - NULL, -}; - -static struct attribute_group zcache_attr_group = { - .attrs = zcache_attrs, - .name = "zcache", -}; - -#endif /* CONFIG_SYSFS */ -/* - * When zcache is disabled ("frozen"), pools can be created and destroyed, - * but all puts (and thus all other operations that require memory allocation) - * must fail. If zcache is unfrozen, accepts puts, then frozen again, - * data consistency requires all puts while frozen to be converted into - * flushes. - */ -static bool zcache_freeze; - -/* - * zcache shrinker interface (only useful for ephemeral pages, so zbud only) - */ -static int shrink_zcache_memory(struct shrinker *shrink, - struct shrink_control *sc) -{ - int ret = -1; - int nr = sc->nr_to_scan; - gfp_t gfp_mask = sc->gfp_mask; - - if (nr >= 0) { - if (!(gfp_mask & __GFP_FS)) - /* does this case really need to be skipped? */ - goto out; - zbud_evict_pages(nr); - } - ret = (int)atomic_read(&zcache_zbud_curr_raw_pages); -out: - return ret; -} - -static struct shrinker zcache_shrinker = { - .shrink = shrink_zcache_memory, - .seeks = DEFAULT_SEEKS, -}; - -/* - * zcache shims between cleancache/frontswap ops and tmem - */ - -static int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - - BUG_ON(!irqs_disabled()); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (unlikely(pool == NULL)) - goto out; - if (!zcache_freeze && zcache_do_preload(pool) == 0) { - /* preload does preempt_disable on success */ - ret = tmem_put(pool, oidp, index, (char *)(page), - PAGE_SIZE, 0, is_ephemeral(pool)); - if (ret < 0) { - if (is_ephemeral(pool)) - zcache_failed_eph_puts++; - else - zcache_failed_pers_puts++; - } - zcache_put_pool(pool); - preempt_enable_no_resched(); - } else { - zcache_put_to_flush++; - if (atomic_read(&pool->obj_count) > 0) - /* the put fails whether the flush succeeds or not */ - (void)tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } -out: - return ret; -} - -static int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp, - uint32_t index, struct page *page) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - size_t size = PAGE_SIZE; - - local_irq_save(flags); - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_get(pool, oidp, index, (char *)(page), - &size, 0, is_ephemeral(pool)); - zcache_put_pool(pool); - } - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_page(int cli_id, int pool_id, - struct tmem_oid *oidp, uint32_t index) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flush_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_page(pool, oidp, index); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flush_found++; - local_irq_restore(flags); - return ret; -} - -static int zcache_flush_object(int cli_id, int pool_id, - struct tmem_oid *oidp) -{ - struct tmem_pool *pool; - int ret = -1; - unsigned long flags; - - local_irq_save(flags); - zcache_flobj_total++; - pool = zcache_get_pool_by_id(cli_id, pool_id); - if (likely(pool != NULL)) { - if (atomic_read(&pool->obj_count) > 0) - ret = tmem_flush_object(pool, oidp); - zcache_put_pool(pool); - } - if (ret >= 0) - zcache_flobj_found++; - 
local_irq_restore(flags); - return ret; -} - -static int zcache_destroy_pool(int cli_id, int pool_id) -{ - struct tmem_pool *pool = NULL; - struct zcache_client *cli = NULL; - int ret = -1; - - if (pool_id < 0) - goto out; - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = cli->tmem_pools[pool_id]; - if (pool == NULL) - goto out; - cli->tmem_pools[pool_id] = NULL; - /* wait for pool activity on other cpus to quiesce */ - while (atomic_read(&pool->refcount) != 0) - ; - atomic_dec(&cli->refcount); - local_bh_disable(); - ret = tmem_destroy_pool(pool); - local_bh_enable(); - kfree(pool); - pr_info("zcache: destroyed pool id=%d, cli_id=%d\n", - pool_id, cli_id); -out: - return ret; -} - -static int zcache_new_pool(uint16_t cli_id, uint32_t flags) -{ - int poolid = -1; - struct tmem_pool *pool; - struct zcache_client *cli = NULL; - - if (cli_id == LOCAL_CLIENT) - cli = &zcache_host; - else if ((unsigned int)cli_id < MAX_CLIENTS) - cli = &zcache_clients[cli_id]; - if (cli == NULL) - goto out; - atomic_inc(&cli->refcount); - pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC); - if (pool == NULL) { - pr_info("zcache: pool creation failed: out of memory\n"); - goto out; - } - - for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++) - if (cli->tmem_pools[poolid] == NULL) - break; - if (poolid >= MAX_POOLS_PER_CLIENT) { - pr_info("zcache: pool creation failed: max exceeded\n"); - kfree(pool); - poolid = -1; - goto out; - } - atomic_set(&pool->refcount, 0); - pool->client = cli; - pool->pool_id = poolid; - tmem_new_pool(pool, flags); - cli->tmem_pools[poolid] = pool; - pr_info("zcache: created %s tmem pool, id=%d, client=%d\n", - flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral", - poolid, cli_id); -out: - if (cli != NULL) - atomic_dec(&cli->refcount); - return poolid; -} - -/********** - * Two kernel functionalities currently can be layered on top of tmem. - * These are "cleancache" which is used as a second-chance cache for clean - * page cache pages; and "frontswap" which is used for swap pages - * to avoid writes to disk. A generic "shim" is provided here for each - * to translate in-kernel semantics to zcache semantics. 
- */ - -#ifdef CONFIG_CLEANCACHE -static void zcache_cleancache_put_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index, page); -} - -static int zcache_cleancache_get_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index, struct page *page) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - int ret = -1; - - if (likely(ind == index)) - ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index, page); - return ret; -} - -static void zcache_cleancache_flush_page(int pool_id, - struct cleancache_filekey key, - pgoff_t index) -{ - u32 ind = (u32) index; - struct tmem_oid oid = *(struct tmem_oid *)&key; - - if (likely(ind == index)) - (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind); -} - -static void zcache_cleancache_flush_inode(int pool_id, - struct cleancache_filekey key) -{ - struct tmem_oid oid = *(struct tmem_oid *)&key; - - (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid); -} - -static void zcache_cleancache_flush_fs(int pool_id) -{ - if (pool_id >= 0) - (void)zcache_destroy_pool(LOCAL_CLIENT, pool_id); -} - -static int zcache_cleancache_init_fs(size_t pagesize) -{ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); -} - -static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize) -{ - /* shared pools are unsupported and map to private */ - BUG_ON(sizeof(struct cleancache_filekey) != - sizeof(struct tmem_oid)); - BUG_ON(pagesize != PAGE_SIZE); - return zcache_new_pool(LOCAL_CLIENT, 0); -} - -static struct cleancache_ops zcache_cleancache_ops = { - .put_page = zcache_cleancache_put_page, - .get_page = zcache_cleancache_get_page, - .invalidate_page = zcache_cleancache_flush_page, - .invalidate_inode = zcache_cleancache_flush_inode, - .invalidate_fs = zcache_cleancache_flush_fs, - .init_shared_fs = zcache_cleancache_init_shared_fs, - .init_fs = zcache_cleancache_init_fs -}; - -struct cleancache_ops zcache_cleancache_register_ops(void) -{ - struct cleancache_ops old_ops = - cleancache_register_ops(&zcache_cleancache_ops); - - return old_ops; -} -#endif - -#ifdef CONFIG_FRONTSWAP -/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ -static int zcache_frontswap_poolid = -1; - -/* - * Swizzling increases objects per swaptype, increasing tmem concurrency - * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS - * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from - * frontswap_get_page(), but has side-effects. Hence using 8. 
- */ -#define SWIZ_BITS 8 -#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) -#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) -#define iswiz(_ind) (_ind >> SWIZ_BITS) - -static inline struct tmem_oid oswiz(unsigned type, u32 ind) -{ - struct tmem_oid oid = { .oid = { 0 } }; - oid.oid[0] = _oswiz(type, ind); - return oid; -} - -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - unsigned long flags; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) { - local_irq_save(flags); - ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); - local_irq_restore(flags); - } - return ret; -} - -/* returns 0 if the page was successfully gotten from frontswap, -1 if - * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, - struct page *page) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - int ret = -1; - - BUG_ON(!PageLocked(page)); - if (likely(ind64 == ind)) - ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind), page); - return ret; -} - -/* flush a single page from frontswap */ -static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset) -{ - u64 ind64 = (u64)offset; - u32 ind = (u32)offset; - struct tmem_oid oid = oswiz(type, ind); - - if (likely(ind64 == ind)) - (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid, - &oid, iswiz(ind)); -} - -/* flush all pages from the passed swaptype */ -static void zcache_frontswap_flush_area(unsigned type) -{ - struct tmem_oid oid; - int ind; - - for (ind = SWIZ_MASK; ind >= 0; ind--) { - oid = oswiz(type, ind); - (void)zcache_flush_object(LOCAL_CLIENT, - zcache_frontswap_poolid, &oid); - } -} - -static void zcache_frontswap_init(unsigned ignored) -{ - /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ - if (zcache_frontswap_poolid < 0) - zcache_frontswap_poolid = - zcache_new_pool(LOCAL_CLIENT, TMEM_POOL_PERSIST); -} - -static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, - .invalidate_page = zcache_frontswap_flush_page, - .invalidate_area = zcache_frontswap_flush_area, - .init = zcache_frontswap_init -}; - -struct frontswap_ops zcache_frontswap_register_ops(void) -{ - struct frontswap_ops old_ops = - frontswap_register_ops(&zcache_frontswap_ops); - - return old_ops; -} -#endif - -/* - * zcache initialization - * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR - * NOTHING HAPPENS! 
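[Side note, not part of the patch: the swizzling scheme in the old staging zcache code being removed above is worth illustrating, since the comment explains the intent but not the arithmetic. With SWIZ_BITS = 8, the low 8 bits of the swap offset are folded, together with the swap type, into the tmem object id, and the remaining high bits become the page index inside that object. A throwaway user-space sketch using the same macro definitions, with assumed example values:]

#include <stdio.h>

#define SWIZ_BITS 8
#define SWIZ_MASK ((1 << SWIZ_BITS) - 1)
#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
#define iswiz(_ind) (_ind >> SWIZ_BITS)

int main(void)
{
	unsigned type = 1;		/* swap type (swapfile number), assumed */
	unsigned ind  = 0x12345;	/* 32-bit swap offset, assumed          */

	/* the object id mixes the type with the low SWIZ_BITS of the offset */
	unsigned oid0 = _oswiz(type, ind);	/* (1 << 8) | 0x45 = 0x145 */
	/* the index within that object is what remains after shifting      */
	unsigned idx  = iswiz(ind);		/* 0x12345 >> 8    = 0x123 */

	printf("oid[0]=%#x index=%#x\n", oid0, idx);
	return 0;
}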
- */ - -static int zcache_enabled; - -static int __init enable_zcache(char *s) -{ - zcache_enabled = 1; - return 1; -} -__setup("zcache", enable_zcache); - -/* allow independent dynamic disabling of cleancache and frontswap */ - -static int use_cleancache = 1; - -static int __init no_cleancache(char *s) -{ - use_cleancache = 0; - return 1; -} - -__setup("nocleancache", no_cleancache); - -static int use_frontswap = 1; - -static int __init no_frontswap(char *s) -{ - use_frontswap = 0; - return 1; -} - -__setup("nofrontswap", no_frontswap); - -static int __init enable_zcache_compressor(char *s) -{ - strncpy(zcache_comp_name, s, ZCACHE_COMP_NAME_SZ); - zcache_enabled = 1; - return 1; -} -__setup("zcache=", enable_zcache_compressor); - - -static int zcache_comp_init(void) -{ - int ret = 0; - - /* check crypto algorithm */ - if (*zcache_comp_name != '\0') { - ret = crypto_has_comp(zcache_comp_name, 0, 0); - if (!ret) - pr_info("zcache: %s not supported\n", - zcache_comp_name); - } - if (!ret) - strcpy(zcache_comp_name, "lzo"); - ret = crypto_has_comp(zcache_comp_name, 0, 0); - if (!ret) { - ret = 1; - goto out; - } - pr_info("zcache: using %s compressor\n", zcache_comp_name); - - /* alloc percpu transforms */ - ret = 0; - zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); - if (!zcache_comp_pcpu_tfms) - ret = 1; -out: - return ret; -} - -static int __init zcache_init(void) -{ - int ret = 0; - -#ifdef CONFIG_SYSFS - ret = sysfs_create_group(mm_kobj, &zcache_attr_group); - if (ret) { - pr_err("zcache: can't create sysfs\n"); - goto out; - } -#endif /* CONFIG_SYSFS */ -#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP) - if (zcache_enabled) { - unsigned int cpu; - - tmem_register_hostops(&zcache_hostops); - tmem_register_pamops(&zcache_pamops); - ret = register_cpu_notifier(&zcache_cpu_notifier_block); - if (ret) { - pr_err("zcache: can't register cpu notifier\n"); - goto out; - } - ret = zcache_comp_init(); - if (ret) { - pr_err("zcache: compressor initialization failed\n"); - goto out; - } - for_each_online_cpu(cpu) { - void *pcpu = (void *)(long)cpu; - zcache_cpu_notifier(&zcache_cpu_notifier_block, - CPU_UP_PREPARE, pcpu); - } - } - zcache_objnode_cache = kmem_cache_create("zcache_objnode", - sizeof(struct tmem_objnode), 0, 0, NULL); - zcache_obj_cache = kmem_cache_create("zcache_obj", - sizeof(struct tmem_obj), 0, 0, NULL); - ret = zcache_new_client(LOCAL_CLIENT); - if (ret) { - pr_err("zcache: can't create client\n"); - goto out; - } -#endif -#ifdef CONFIG_CLEANCACHE - if (zcache_enabled && use_cleancache) { - struct cleancache_ops old_ops; - - zbud_init(); - register_shrinker(&zcache_shrinker); - old_ops = zcache_cleancache_register_ops(); - pr_info("zcache: cleancache enabled using kernel " - "transcendent memory and compression buddies\n"); - if (old_ops.init_fs != NULL) - pr_warning("zcache: cleancache_ops overridden"); - } -#endif -#ifdef CONFIG_FRONTSWAP - if (zcache_enabled && use_frontswap) { - struct frontswap_ops old_ops; - - old_ops = zcache_frontswap_register_ops(); - pr_info("zcache: frontswap enabled using kernel " - "transcendent memory and zsmalloc\n"); - if (old_ops.init != NULL) - pr_warning("zcache: frontswap_ops overridden"); - } -#endif -out: - return ret; -} - -module_init(zcache_init) From 4d72c37dde25b010b17415f3dfaae72dd79bd89e Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 06:59:44 -0400 Subject: [PATCH 40/52] ZCACHE: add zcache --- arch/arm/configs/ik_defconfig | 5 +- drivers/staging/Kconfig | 4 - drivers/staging/Makefile | 2 - 
drivers/staging/android/lowmemorykiller.c | 13 +- drivers/xen/tmem.c | 30 +- fs/ocfs2/super.c | 3 +- fs/super.c | 2 +- include/linux/cleancache.h | 15 +- include/linux/page-flags.h | 3 + include/linux/zcache.h | 22 + mm/Kconfig | 16 + mm/Makefile | 1 + mm/cleancache.c | 181 +++- mm/page_alloc.c | 3 + mm/vmscan.c | 8 + mm/zcache.c | 1158 +++++++++++++++++++++ 16 files changed, 1393 insertions(+), 73 deletions(-) create mode 100644 include/linux/zcache.h create mode 100644 mm/zcache.c diff --git a/arch/arm/configs/ik_defconfig b/arch/arm/configs/ik_defconfig index 5ff50e067..7ff4bd3d8 100644 --- a/arch/arm/configs/ik_defconfig +++ b/arch/arm/configs/ik_defconfig @@ -607,7 +607,7 @@ CONFIG_KSM=y CONFIG_UKSM=y # CONFIG_KSM_LEGACY is not set CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 -# CONFIG_CLEANCACHE is not set +CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_ZSMALLOC_NEW=y CONFIG_PGTABLE_MAPPING=y @@ -615,7 +615,8 @@ CONFIG_ZSWAP=y # CONFIG_SWAP_ENABLE_READAHEAD is not set # CONFIG_ZSWAP_ENABLE_WRITEBACK is not set CONFIG_ZPOOL=y -# CONFIG_ZBUD is not set +CONFIG_ZBUD=y +CONFIG_ZCACHE=y CONFIG_DIRECT_RECLAIM_FILE_PAGES_ONLY=y CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y CONFIG_FIX_INACTIVE_RATIO=y diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index f5edd3dc8..c15649439 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -80,8 +80,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zcache/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" @@ -122,8 +120,6 @@ source "drivers/staging/android/Kconfig" source "drivers/staging/telephony/Kconfig" -source "drivers/staging/ramster/Kconfig" - source "drivers/staging/ozwpan/Kconfig" source "drivers/staging/vnswap/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 4da2ea418..605826f30 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZCACHE) += zcache/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xx/ @@ -52,6 +51,5 @@ obj-$(CONFIG_MFD_NVEC) += nvec/ obj-$(CONFIG_DRM_OMAP) += omapdrm/ obj-$(CONFIG_ANDROID) += android/ obj-$(CONFIG_PHONE) += telephony/ -obj-$(CONFIG_RAMSTER) += ramster/ obj-$(CONFIG_USB_WPAN_HCD) += ozwpan/ obj-$(CONFIG_VNSWAP) += vnswap/ diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index 9b01986b8..bf2158b6c 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -197,7 +198,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) is_active_high = (global_page_state(NR_ACTIVE_FILE) > global_page_state(NR_INACTIVE_FILE)) ? 
1 : 0; #endif - other_file = global_page_state(NR_FILE_PAGES); + other_file = global_page_state(NR_FILE_PAGES) + zcache_pages(); #if defined(CONFIG_CMA_PAGE_COUNTING) && defined(CONFIG_EXCLUDE_LRU_LIVING_IN_CMA) if (get_nr_swap_pages() < SSWAP_LMK_THRESHOLD && cma_page_ratio >= CMA_PAGE_RATIO @@ -303,22 +304,22 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) #if defined(CONFIG_CMA_PAGE_COUNTING) lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, " "ofree %d, ofile %d(%c), is_kswapd %d - " - "cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n", + "cma_free %lu priority %d cma_i_file %lu cma_a_file %lu\n,Total zcache is %ldkB\n", selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize, other_free, other_file, flag ? '-' : '+', !!current_is_kswapd(), nr_cma_free, sc->priority, - nr_cma_inactive_file, nr_cma_active_file); + nr_cma_inactive_file, nr_cma_active_file, (long)zcache_pages() * (long)(PAGE_SIZE / 1024)); #else lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d, " "free memory = %d, reclaimable memory = %d " - "is_kswapd %d cma_free %lu priority %d\n", + "is_kswapd %d cma_free %lu priority %d\nTotal zcache is %ldkB\n", selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize, other_free, other_file, !!current_is_kswapd(), - nr_cma_free, sc->priority); + nr_cma_free, sc->priority,(long)zcache_pages() * (long)(PAGE_SIZE / 1024)); #endif lowmem_deathpending_timeout = jiffies + HZ; send_sig(SIGKILL, selected, 0); @@ -401,7 +402,7 @@ static int android_oom_handler(struct notifier_block *nb, nr_cma_inactive_file = global_page_state(NR_CMA_INACTIVE_FILE); nr_cma_active_file = global_page_state(NR_CMA_ACTIVE_FILE); - other_file = global_page_state(NR_FILE_PAGES) - + other_file = global_page_state(NR_FILE_PAGES) + zcache_pages() - global_page_state(NR_SHMEM) - total_swapcache_pages() - nr_cma_inactive_file - diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index 89f264c67..de9273ea7 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -235,7 +235,7 @@ static int __init no_cleancache(char *s) } __setup("nocleancache", no_cleancache); -static struct cleancache_ops __initdata tmem_cleancache_ops = { +static const struct cleancache_ops tmem_cleancache_ops = { .put_page = tmem_cleancache_put_page, .get_page = tmem_cleancache_get_page, .invalidate_page = tmem_cleancache_flush_page, @@ -389,14 +389,26 @@ static int __init xen_tmem_init(void) #endif #ifdef CONFIG_CLEANCACHE BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); - if (tmem_enabled && use_cleancache) { - char *s = ""; - struct cleancache_ops old_ops = - cleancache_register_ops(&tmem_cleancache_ops); - if (old_ops.init_fs != NULL) - s = " (WARNING: cleancache_ops overridden)"; - printk(KERN_INFO "cleancache enabled, RAM provided by " - "Xen Transcendent Memory%s\n", s); + if (tmem_enabled && cleancache) { + int err; + + err = cleancache_register_ops(&tmem_cleancache_ops); + if (err) + pr_warn("xen-tmem: failed to enable cleancache: %d\n", + err); + else + pr_info("cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. 
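[Informal reading of the lowmemorykiller change above, not taken from the patch description: pages that zcache has compressed were already reclaimed from the page cache, so NR_FILE_PAGES alone under-counts memory that can still be recovered cheaply; adding zcache_pages(), the number of page frames backing the compressed pool, keeps the killer's "reclaimable" estimate honest. A made-up-numbers calculation mirroring the android_oom_handler formula (CMA terms omitted):]

#include <stdio.h>

int main(void)
{
	/* made-up page counts, 4 KiB pages */
	long nr_file_pages = 30000;	/* global_page_state(NR_FILE_PAGES) */
	long nr_zcache     = 12000;	/* zcache_pages()                   */
	long nr_shmem      =  5000;	/* global_page_state(NR_SHMEM)      */
	long nr_swapcache  =  1000;	/* total_swapcache_pages()          */

	long without_zcache = nr_file_pages - nr_shmem - nr_swapcache;
	long with_zcache    = nr_file_pages + nr_zcache - nr_shmem - nr_swapcache;

	/* 24000 pages (~93 MiB) vs 36000 pages (~140 MiB) of estimated
	 * reclaimable file memory */
	printf("other_file without zcache: %ld pages (~%ld MiB)\n",
	       without_zcache, without_zcache / 256);
	printf("other_file with zcache   : %ld pages (~%ld MiB)\n",
	       with_zcache, with_zcache / 256);
	return 0;
}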
+ */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; } #endif return 0; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 91a0020a0..7e478160b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2357,7 +2357,8 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + + cleancache_init_shared_fs(sb); bail: return status; diff --git a/fs/super.c b/fs/super.c index 672ccae0f..88b352193 100644 --- a/fs/super.c +++ b/fs/super.c @@ -167,7 +167,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.shrink = prune_super; diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h index 42e55deee..cb3e14234 100644 --- a/include/linux/cleancache.h +++ b/include/linux/cleancache.h @@ -5,6 +5,10 @@ #include #include +#define CLEANCACHE_NO_POOL -1 +#define CLEANCACHE_NO_BACKEND -2 +#define CLEANCACHE_NO_BACKEND_SHARED -3 + #define CLEANCACHE_KEY_MAX 6 /* @@ -33,18 +37,17 @@ struct cleancache_ops { void (*invalidate_fs)(int); }; -extern struct cleancache_ops - cleancache_register_ops(struct cleancache_ops *ops); +extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(char *, struct super_block *); +extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); extern void __cleancache_invalidate_inode(struct address_space *); extern void __cleancache_invalidate_fs(struct super_block *); -extern int cleancache_enabled; #ifdef CONFIG_CLEANCACHE +#define cleancache_enabled (1) static inline bool cleancache_fs_enabled(struct page *page) { return page->mapping->host->i_sb->cleancache_poolid >= 0; @@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb) __cleancache_init_fs(sb); } -static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb) +static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) - __cleancache_init_shared_fs(uuid, sb); + __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2d0243350..837539e4d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -115,6 +115,9 @@ enum pageflags { #endif #ifdef CONFIG_SDP PG_sensitive, +#endif +#ifdef CONFIG_ZCACHE + PG_was_active, #endif __NR_PAGEFLAGS, #if defined(CONFIG_CMA_PAGE_COUNTING) diff --git a/include/linux/zcache.h b/include/linux/zcache.h new file mode 100644 index 000000000..2db7e4bbb --- /dev/null +++ b/include/linux/zcache.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
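[Side note on the include/linux/cleancache.h hunk above: cleancache_register_ops() now takes a const ops pointer and reports success or failure as an int (0, or -EBUSY when a backend is already registered) instead of handing back the previous ops structure. A minimal, hypothetical backend init illustrating the reworked convention; my_backend_ops and the pr_info text are assumptions, and a real backend would of course fill in all seven callbacks:]

#include <linux/cleancache.h>
#include <linux/module.h>

/* callbacks omitted in this sketch; a real backend populates every op */
static const struct cleancache_ops my_backend_ops;

static int __init my_backend_init(void)
{
	int err = cleancache_register_ops(&my_backend_ops);

	if (err)	/* -EBUSY: another backend registered first */
		return err;
	pr_info("my_backend: registered as the cleancache backend\n");
	return 0;
}
module_init(my_backend_init);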
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _LINUX_ZCACHE_H +#define _LINUX_ZCACHE_H + +#ifdef CONFIG_ZCACHE +extern u64 zcache_pages(void); +#else +u64 zcache_pages(void) { return 0; } +#endif + +#endif /* _LINUX_ZCACHE_H */ diff --git a/mm/Kconfig b/mm/Kconfig index a34fbc381..81c5191d2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -466,6 +466,22 @@ config ZSWAP from this compressed store much faster than most tradition swap devices resulting in reduced I/O and faster performance for many workloads. + +config ZCACHE + bool "Compressed cache for file pages (EXPERIMENTAL)" + depends on CRYPTO && CLEANCACHE + select CRYPTO_LZO + select ZBUD + default n + help + A compressed cache for file pages. + It takes active file pages that are in the process of being reclaimed + and attempts to compress them into a dynamically allocated RAM-based + memory pool. + + If this process is successful, when those file pages needed again, the + I/O reading operation was avoided. This results in a significant performance + gains under memory pressure for systems full with file pages. config SWAP_ENABLE_READAHEAD bool "Enable readahead on page swap in" diff --git a/mm/Makefile b/mm/Makefile index e5904b71d..f279b0dc1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZCACHE) += zcache.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/cleancache.c b/mm/cleancache.c index 5646c740f..770794807 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,23 +19,13 @@ #include /* - * This global enablement flag may be read thousands of times per second - * by cleancache_get/put/invalidate even on systems where cleancache_ops - * is not claimed (e.g. cleancache is config'ed on but remains - * disabled), so is preferred to the slower alternative: a function - * call that checks a non-global. - */ -int cleancache_enabled __read_mostly; -EXPORT_SYMBOL(cleancache_enabled); - -/* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops cleancache_ops __read_mostly; +static const struct cleancache_ops *cleancache_ops __read_mostly; /* - * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * Counters available via /sys/kernel/debug/cleancache (if debugfs is * properly configured. These are for information only so are not protected * against increment races. */ @@ -44,32 +34,107 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} + /* - * register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting + * Register operations for cleancache. Returns 0 on success. 
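[A small flag on the include/linux/zcache.h hunk a little further up: its #else branch defines zcache_pages() as an ordinary function body in the header, which would produce multiple-definition link errors as soon as two translation units include it with CONFIG_ZCACHE disabled. A hedged sketch of the conventional static inline form (my suggestion, not what the patch currently contains):]

/* include/linux/zcache.h -- suggested form of the !CONFIG_ZCACHE stub */
#ifdef CONFIG_ZCACHE
extern u64 zcache_pages(void);
#else
static inline u64 zcache_pages(void)
{
	return 0;	/* zcache compiled out: it holds no file pages */
}
#endif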
*/ -struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(const struct cleancache_ops *ops) { - struct cleancache_ops old = cleancache_ops; + if (cmpxchg(&cleancache_ops, NULL, ops)) + return -EBUSY; - cleancache_ops = *ops; - cleancache_enabled = 1; - return old; + /* + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. 
+ */ + iterate_supers(cleancache_register_ops_sb, NULL); + return 0; } EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); + int pool_id = CLEANCACHE_NO_BACKEND; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { - sb->cleancache_poolid = - (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; + + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; + } + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -80,7 +145,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); static int cleancache_get_key(struct inode *inode, struct cleancache_filekey *key) { - int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); int len = 0, maxlen = CLEANCACHE_KEY_MAX; struct super_block *sb = inode->i_sb; @@ -88,10 +153,8 @@ static int cleancache_get_key(struct inode *inode, if (sb->s_export_op != NULL) { fhfn = sb->s_export_op->encode_fh; if (fhfn) { - struct dentry d; - d.d_inode = inode; - len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); - if (len <= 0 || len == 255) + len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); + if (len <= FILEID_ROOT || len == FILEID_INVALID) return -1; if (maxlen > CLEANCACHE_KEY_MAX) return -1; @@ -106,6 +169,10 @@ static int cleancache_get_key(struct inode *inode, * successful, use it to fill the specified page with data and return 0. * The pageframe is unchanged and returns -1 if the get fails. * Page must be locked by caller. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ int __cleancache_get_page(struct page *page) { @@ -113,6 +180,11 @@ int __cleancache_get_page(struct page *page) int pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_failed_gets++; + goto out; + } + VM_BUG_ON(!PageLocked(page)); pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id < 0) @@ -121,7 +193,7 @@ int __cleancache_get_page(struct page *page) if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -136,17 +208,26 @@ EXPORT_SYMBOL(__cleancache_get_page); * (previously-obtained per-filesystem) poolid and the page's, * inode and page index. Page must be locked. Note that a put_page * always "succeeds", though a subsequent get_page may succeed or fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. 
*/ void __cleancache_put_page(struct page *page) { int pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) { + cleancache_puts++; + return; + } + VM_BUG_ON(!PageLocked(page)); pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_get_key(page->mapping->host, &key) >= 0) { + cleancache_ops->put_page(pool_id, key, page->index, page); cleancache_puts++; } } @@ -155,6 +236,10 @@ EXPORT_SYMBOL(__cleancache_put_page); /* * Invalidate any data from cleancache associated with the poolid and the * page's inode and page index so that a subsequent "get" will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) @@ -163,11 +248,14 @@ void __cleancache_invalidate_page(struct address_space *mapping, int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + if (pool_id >= 0) { VM_BUG_ON(!PageLocked(page)); if (cleancache_get_key(mapping->host, &key) >= 0) { - (*cleancache_ops.invalidate_page)(pool_id, - key, page->index); + cleancache_ops->invalidate_page(pool_id, + key, page->index); cleancache_invalidates++; } } @@ -178,29 +266,38 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); * Invalidate all data from cleancache associated with the poolid and the * mappings's inode so that all subsequent gets to this poolid/inode * will fail. + * + * The function has two checks before any action is taken - whether + * a backend is registered and whether the sb->cleancache_poolid + * is correct. */ void __cleancache_invalidate_inode(struct address_space *mapping) { int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; + if (!cleancache_ops) + return; + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - (*cleancache_ops.invalidate_inode)(pool_id, key); + cleancache_ops->invalidate_inode(pool_id, key); } EXPORT_SYMBOL(__cleancache_invalidate_inode); /* * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be reutrned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs + * note that pool_id is surrendered and may be returned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs. 
*/ void __cleancache_invalidate_fs(struct super_block *sb) { - if (sb->cleancache_poolid >= 0) { - int old_poolid = sb->cleancache_poolid; - sb->cleancache_poolid = -1; - (*cleancache_ops.invalidate_fs)(old_poolid); - } + int pool_id; + + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd68682b4..1cfc67c3e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6312,6 +6312,9 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_scfslower, "scfslower"}, {1UL << PG_nocache,"nocache"}, #endif +#ifdef CONFIG_ZCACHE + {1UL << PG_was_active, "PG_was_active" }, +#endif }; static void dump_page_flags(unsigned long flags) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3df4609d1..dd2681158 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1284,6 +1284,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, while (!list_empty(page_list)) { struct page *page = lru_to_page(page_list); int lru; + int file; VM_BUG_ON(PageLRU(page)); list_del(&page->lru); @@ -1296,6 +1297,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); + + file = is_file_lru(lru); +#ifdef CONFIG_ZCACHE + if (IS_ENABLED(CONFIG_ZCACHE)) + if (file) + SetPageWasActive(page); +#endif if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); diff --git a/mm/zcache.c b/mm/zcache.c new file mode 100644 index 000000000..ec1c7c919 --- /dev/null +++ b/mm/zcache.c @@ -0,0 +1,1158 @@ +/* + * linux/mm/zcache.c + * + * A cleancache backend for file pages compression. + * Concepts based on original zcache by Dan Magenheimer. + * Copyright (C) 2013 Bob Liu + * + * With zcache, active file pages can be compressed in memory during page + * reclaiming. When their data is needed again the I/O reading operation is + * avoided. This results in a significant performance gain under memory pressure + * for systems with many file pages. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Enable/disable zcache (enabled by default) + */ +static bool zcache_enabled = true; +module_param_named(enabled, zcache_enabled, bool, 0); + +/* + * Compressor to be used by zcache + */ +#define ZCACHE_COMPRESSOR_DEFAULT "lzo" +#ifndef CONFIG_CRYPTO_LZ4 +static char *zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; +#else +static char *zcache_compressor = "lz4"; +#endif +module_param_named(compressor, zcache_compressor, charp, 0); + +/* + * The maximum percentage of memory that the compressed pool can occupy. 
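[For reference, not part of the patch: since these knobs are plain module_params in built-in code, they can be set from the kernel command line with the zcache. prefix, and the ones registered with mode 0644 (max_pool_percent and clear_percent, defined just below) can also be adjusted at runtime under /sys/module/zcache/parameters/. An assumed example, taking an available lz4 crypto compressor for granted:]

    zcache.compressor=lz4 zcache.max_pool_percent=15      (kernel command line)
    echo 20 > /sys/module/zcache/parameters/max_pool_percent   (at runtime)

[zcache is enabled by default; zcache.enabled=0 on the command line turns it off.]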
+ */ +static unsigned int zcache_max_pool_percent = 10; +module_param_named(max_pool_percent, zcache_max_pool_percent, uint, 0644); + +static unsigned int zcache_clear_percent = 4; +module_param_named(clear_percent, zcache_clear_percent, uint, 0644); +/* + * zcache statistics + */ +static u64 zcache_pool_limit_hit; +static u64 zcache_dup_entry; +static u64 zcache_zbud_alloc_fail; +static u64 zcache_evict_zpages; +static u64 zcache_evict_filepages; +static u64 zcache_inactive_pages_refused; +static u64 zcache_reclaim_fail; +static u64 zcache_pool_shrink; +static u64 zcache_pool_shrink_fail; +static u64 zcache_pool_shrink_pages; +static u64 zcache_store_failed; +static atomic_t zcache_stored_pages = ATOMIC_INIT(0); +static atomic_t zcache_stored_zero_pages = ATOMIC_INIT(0); + +#define GFP_ZCACHE \ + (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NOMEMALLOC | __GFP_NO_KSWAPD | __GFP_ZERO) + +/* + * Make sure this is different from radix tree + * indirect ptr or exceptional entry. + */ +#define ZERO_HANDLE ((void *)~(~0UL >> 1)) + +/* + * Zcache receives pages for compression through the Cleancache API and is able + * to evict pages from its own compressed pool on an LRU basis in the case that + * the compressed pool is full. + * + * Zcache makes use of zbud for the managing the compressed memory pool. Each + * allocation in zbud is not directly accessible by address. Rather, a handle + * (zaddr) is return by the allocation routine and that handle(zaddr must be + * mapped before being accessed. The compressed memory pool grows on demand and + * shrinks as compressed pages are freed. + * + * When a file page is passed from cleancache to zcache, zcache maintains a + * mapping of the to the zbud + * address that references that compressed file page. This mapping is achieved + * with a red-black tree per filesystem type, plus a radix tree per red-black + * node. + * + * A zcache pool with pool_id as the index is created when a filesystem mounted + * Each zcache pool has a red-black tree, the inode number(rb_index) is the + * search key. Each red-black tree node has a radix tree which use + * page->index(ra_index) as the index. Each radix tree slot points to the zbud + * address combining with some extra information(zcache_ra_handle). + */ +#define MAX_ZCACHE_POOLS 32 +/* + * One zcache_pool per (cleancache aware) filesystem mount instance + */ +struct zcache_pool { + struct rb_root rbtree; + rwlock_t rb_lock; /* Protects rbtree */ + u64 size; + struct zbud_pool *pool; /* Zbud pool used */ +}; + +/* + * Manage all zcache pools + */ +struct _zcache { + struct zcache_pool *pools[MAX_ZCACHE_POOLS]; + u32 num_pools; /* Current no. of zcache pools */ + spinlock_t pool_lock; /* Protects pools[] and num_pools */ +}; +struct _zcache zcache; + +/* + * Redblack tree node, each node has a page index radix-tree. + * Indexed by inode nubmer. 
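[A quick aside on the ZERO_HANDLE sentinel defined above (informal): ~(~0UL >> 1) leaves only the most significant bit of an unsigned long set, so its low two bits are clear and a radix-tree slot holding it reads back as a plain data pointer rather than an indirect or exceptional entry, while in practice it is never a value zbud_alloc() would hand out. A throwaway user-space check of the arithmetic:]

#include <stdio.h>

#define ZERO_HANDLE ((void *)~(~0UL >> 1))

int main(void)
{
	unsigned long v = (unsigned long)ZERO_HANDLE;

	/* 0x80000000 on the 32-bit ARM build this series targets,
	 * 0x8000000000000000 on a 64-bit build: only the top bit is set */
	printf("ZERO_HANDLE  = %#lx\n", v);

	/* radix-tree indirect/exceptional entries are tagged in the low
	 * two bits; both are clear here */
	printf("low two bits = %lu\n", v & 3UL);
	return 0;
}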
+ */ +struct zcache_rbnode { + struct rb_node rb_node; + int rb_index; + struct radix_tree_root ratree; /* Page radix tree per inode rbtree */ + spinlock_t ra_lock; /* Protects radix tree */ + struct kref refcount; +}; + +/* + * Radix-tree leaf, indexed by page->index + */ +struct zcache_ra_handle { + int rb_index; /* Redblack tree index */ + int ra_index; /* Radix tree index */ + int zlen; /* Compressed page size */ + struct zcache_pool *zpool; /* Finding zcache_pool during evict */ +}; + +u64 zcache_pages(void) +{ + int i; + u64 count = 0; + + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + count += zcache.pools[i]->size; + + return count; +} + +static struct kmem_cache *zcache_rbnode_cache; +static int zcache_rbnode_cache_create(void) +{ + zcache_rbnode_cache = KMEM_CACHE(zcache_rbnode, 0); + return zcache_rbnode_cache == NULL; +} +static void zcache_rbnode_cache_destroy(void) +{ + kmem_cache_destroy(zcache_rbnode_cache); +} + +static int zcache_shrink(struct shrinker *s, struct shrink_control *sc) +{ + unsigned long active_file; + unsigned long file; + long file_gap; + unsigned long freed = 0; + unsigned long pool; + static bool running; + int i = 0; + int retries; + + if (running) + goto end; + + running = true; + active_file = global_page_state(NR_ACTIVE_FILE); + file = global_page_state(NR_FILE_PAGES); + pool = zcache_pages(); + + file_gap = pool - file; + + if ((file_gap >= 0) && + (totalram_pages * zcache_clear_percent / 100 > file)) { + file_gap = pool; + zcache_pool_shrink++; + goto reclaim; + } + + /* + * file_gap == 0 means that the number of pages + * stored by zcache is around twice as many as the + * number of active file pages. + */ + file_gap = pool - active_file; + if (file_gap < 0) + file_gap = 0; + else + zcache_pool_shrink++; + +reclaim: + retries = file_gap; + while ((file_gap > 0) && retries) { + struct zcache_pool *zpool = + zcache.pools[i++ % MAX_ZCACHE_POOLS]; + if (!zpool || !zpool->size) + continue; + if (zbud_reclaim_page(zpool->pool, 8)) { + zcache_pool_shrink_fail++; + retries--; + continue; + } + freed++; + file_gap--; + } + + zcache_pool_shrink_pages += freed; + for (i = 0; (i < MAX_ZCACHE_POOLS) && zcache.pools[i]; i++) + zcache.pools[i]->size = + zbud_get_pool_size(zcache.pools[i]->pool); + + running = false; +end: + return freed; +} + +static struct shrinker zcache_shrinker = { + .shrink = zcache_shrink, + .seeks = DEFAULT_SEEKS * 16 +}; + +/* + * Compression functions + * (Below functions are copyed from zswap!) 
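[To put the two percentage knobs into rough numbers (illustrative only, assuming a 2 GiB device with 4 KiB pages, i.e. totalram_pages = 524288): the shrinker above drains the pool hard once file pages fall below roughly clear_percent of RAM, and zcache_is_full() a little further down caps the compressed pool at max_pool_percent of RAM. A throwaway calculation with the default values:]

#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 524288;	/* assumed: 2 GiB of 4 KiB pages  */
	unsigned int max_pool_percent = 10;	/* default zcache.max_pool_percent */
	unsigned int clear_percent = 4;		/* default zcache.clear_percent    */

	unsigned long pool_cap   = totalram_pages * max_pool_percent / 100;
	unsigned long file_floor = totalram_pages * clear_percent / 100;

	/* compressed pool may grow to ~52428 pages (~204 MiB) ... */
	printf("pool cap   : %lu pages (~%lu MiB)\n", pool_cap, pool_cap / 256);
	/* ... and it is drained aggressively once file pages drop below
	 * ~20971 pages (~81 MiB) */
	printf("file floor : %lu pages (~%lu MiB)\n", file_floor, file_floor / 256);
	return 0;
}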
+ */ +static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms; + +enum comp_op { + ZCACHE_COMPOP_COMPRESS, + ZCACHE_COMPOP_DECOMPRESS +}; + +static int zcache_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZCACHE_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZCACHE_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zcache_comp_init(void) +{ + if (!crypto_has_comp(zcache_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zcache_compressor); + /* fall back to default compressor */ + zcache_compressor = ZCACHE_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zcache_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zcache_compressor); + + /* alloc percpu transforms */ + zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zcache_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void zcache_comp_exit(void) +{ + /* free percpu transforms */ + if (zcache_comp_pcpu_tfms) + free_percpu(zcache_comp_pcpu_tfms); +} + +/* + * Per-cpu code + * (Below functions are also copyed from zswap!) + */ +static DEFINE_PER_CPU(u8 *, zcache_dstmem); + +static int __zcache_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zcache_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm; + dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zcache_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zcache_dstmem, cpu); + kfree(dst); + per_cpu(zcache_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zcache_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + + return __zcache_cpu_notifier(action, cpu); +} + +static struct notifier_block zcache_cpu_notifier_block = { + .notifier_call = zcache_cpu_notifier +}; + +static int zcache_cpu_init(void) +{ + unsigned long cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (__zcache_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + register_cpu_notifier(&zcache_cpu_notifier_block); + put_online_cpus(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zcache_cpu_notifier(CPU_UP_CANCELED, cpu); + put_online_cpus(); + return -ENOMEM; +} + +/* + * Zcache helpers + */ +static bool zcache_is_full(void) +{ + long file = global_page_state(NR_FILE_PAGES); + + return ((totalram_pages * zcache_max_pool_percent / 100 < + zcache_pages()) || + (totalram_pages * zcache_clear_percent / 100 > + file)); +} + +/* + * The caller must hold zpool->rb_lock at least + */ +static struct zcache_rbnode *zcache_find_rbnode(struct rb_root *rbtree, + int index, struct rb_node 
**rb_parent, struct rb_node ***rb_link) +{ + struct zcache_rbnode *entry; + struct rb_node **__rb_link, *__rb_parent, *rb_prev; + + __rb_link = &rbtree->rb_node; + rb_prev = __rb_parent = NULL; + + while (*__rb_link) { + __rb_parent = *__rb_link; + entry = rb_entry(__rb_parent, struct zcache_rbnode, rb_node); + if (entry->rb_index > index) + __rb_link = &__rb_parent->rb_left; + else if (entry->rb_index < index) { + rb_prev = __rb_parent; + __rb_link = &__rb_parent->rb_right; + } else + return entry; + } + + if (rb_parent) + *rb_parent = __rb_parent; + if (rb_link) + *rb_link = __rb_link; + return NULL; +} + +static struct zcache_rbnode *zcache_find_get_rbnode(struct zcache_pool *zpool, + int rb_index) +{ + unsigned long flags; + struct zcache_rbnode *rbnode; + + read_lock_irqsave(&zpool->rb_lock, flags); + rbnode = zcache_find_rbnode(&zpool->rbtree, rb_index, 0, 0); + if (rbnode) + kref_get(&rbnode->refcount); + read_unlock_irqrestore(&zpool->rb_lock, flags); + return rbnode; +} + +/* + * kref_put callback for zcache_rbnode. + * + * The rbnode must have been isolated from rbtree already. + */ +static void zcache_rbnode_release(struct kref *kref) +{ + struct zcache_rbnode *rbnode; + + rbnode = container_of(kref, struct zcache_rbnode, refcount); + BUG_ON(rbnode->ratree.rnode); + kmem_cache_free(zcache_rbnode_cache, rbnode); +} + +/* + * Check whether the radix-tree of this rbnode is empty. + * If that's true, then we can delete this zcache_rbnode from + * zcache_pool->rbtree + * + * Caller must hold zcache_rbnode->ra_lock + */ +static int zcache_rbnode_empty(struct zcache_rbnode *rbnode) +{ + return rbnode->ratree.rnode == NULL; +} + +/* + * Remove zcache_rbnode from zpool->rbtree + * + * holded_rblock - whether the caller has holded zpool->rb_lock + */ +static void zcache_rbnode_isolate(struct zcache_pool *zpool, + struct zcache_rbnode *rbnode, bool holded_rblock) +{ + unsigned long flags; + + if (!holded_rblock) + write_lock_irqsave(&zpool->rb_lock, flags); + /* + * Someone can get reference on this rbnode before we could + * acquire write lock above. + * We want to remove it from zpool->rbtree when only the caller and + * corresponding ratree holds a reference to this rbnode. + * Below check ensures that a racing zcache put will not end up adding + * a page to an isolated node and thereby losing that memory. + */ + if (atomic_read(&rbnode->refcount.refcount) == 2) { + rb_erase(&rbnode->rb_node, &zpool->rbtree); + RB_CLEAR_NODE(&rbnode->rb_node); + kref_put(&rbnode->refcount, zcache_rbnode_release); + } + if (!holded_rblock) + write_unlock_irqrestore(&zpool->rb_lock, flags); +} + +/* + * Store zaddr which allocated by zbud_alloc() to the hierarchy rbtree-ratree. 
+ */ +static int zcache_store_zaddr(struct zcache_pool *zpool, + int ra_index, int rb_index, unsigned long zaddr) +{ + unsigned long flags; + struct zcache_rbnode *rbnode, *tmp; + struct rb_node **link = NULL, *parent = NULL; + int ret; + void *dup_zaddr; + + rbnode = zcache_find_get_rbnode(zpool, rb_index); + if (!rbnode) { + /* alloc and init a new rbnode */ + rbnode = kmem_cache_alloc(zcache_rbnode_cache, + GFP_ZCACHE); + if (!rbnode) + return -ENOMEM; + + INIT_RADIX_TREE(&rbnode->ratree, GFP_ATOMIC|__GFP_NOWARN); + spin_lock_init(&rbnode->ra_lock); + rbnode->rb_index = rb_index; + kref_init(&rbnode->refcount); + RB_CLEAR_NODE(&rbnode->rb_node); + + /* add that rbnode to rbtree */ + write_lock_irqsave(&zpool->rb_lock, flags); + tmp = zcache_find_rbnode(&zpool->rbtree, rb_index, + &parent, &link); + if (tmp) { + /* somebody else allocated new rbnode */ + kmem_cache_free(zcache_rbnode_cache, rbnode); + rbnode = tmp; + } else { + rb_link_node(&rbnode->rb_node, parent, link); + rb_insert_color(&rbnode->rb_node, &zpool->rbtree); + } + + /* Inc the reference of this zcache_rbnode */ + kref_get(&rbnode->refcount); + write_unlock_irqrestore(&zpool->rb_lock, flags); + } + + /* Succfully got a zcache_rbnode when arriving here */ + spin_lock_irqsave(&rbnode->ra_lock, flags); + dup_zaddr = radix_tree_delete(&rbnode->ratree, ra_index); + if (unlikely(dup_zaddr)) { + if (dup_zaddr == ZERO_HANDLE) { + atomic_dec(&zcache_stored_zero_pages); + } else { + zbud_free(zpool->pool, (unsigned long)dup_zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + zcache_dup_entry++; + } + + /* Insert zcache_ra_handle to ratree */ + ret = radix_tree_insert(&rbnode->ratree, ra_index, + (void *)zaddr); + spin_unlock_irqrestore(&rbnode->ra_lock, flags); + if (unlikely(ret)) { + write_lock_irqsave(&zpool->rb_lock, flags); + spin_lock(&rbnode->ra_lock); + + if (zcache_rbnode_empty(rbnode)) + zcache_rbnode_isolate(zpool, rbnode, 1); + + spin_unlock(&rbnode->ra_lock); + write_unlock_irqrestore(&zpool->rb_lock, flags); + } + + kref_put(&rbnode->refcount, zcache_rbnode_release); + return ret; +} + +/* + * Load zaddr and delete it from radix tree. + * If the radix tree of the corresponding rbnode is empty, delete the rbnode + * from zpool->rbtree also. 
+ */ +static void *zcache_load_delete_zaddr(struct zcache_pool *zpool, + int rb_index, int ra_index) +{ + struct zcache_rbnode *rbnode; + void *zaddr = NULL; + unsigned long flags; + + rbnode = zcache_find_get_rbnode(zpool, rb_index); + if (!rbnode) + goto out; + + BUG_ON(rbnode->rb_index != rb_index); + + spin_lock_irqsave(&rbnode->ra_lock, flags); + zaddr = radix_tree_delete(&rbnode->ratree, ra_index); + spin_unlock_irqrestore(&rbnode->ra_lock, flags); + + /* rb_lock and ra_lock must be taken again in the given sequence */ + write_lock_irqsave(&zpool->rb_lock, flags); + spin_lock(&rbnode->ra_lock); + if (zcache_rbnode_empty(rbnode)) + zcache_rbnode_isolate(zpool, rbnode, 1); + spin_unlock(&rbnode->ra_lock); + write_unlock_irqrestore(&zpool->rb_lock, flags); + + kref_put(&rbnode->refcount, zcache_rbnode_release); +out: + return zaddr; +} + +static bool zero_page(struct page *page) +{ + unsigned long *ptr = kmap_atomic(page); + int i; + bool ret = false; + + for (i = 0; i < PAGE_SIZE / sizeof(*ptr); i++) { + if (ptr[i]) + goto out; + } + ret = true; +out: + kunmap_atomic(ptr); + return ret; +} + +static void zcache_store_page(int pool_id, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + struct zcache_ra_handle *zhandle; + u8 *zpage, *src, *dst; + /* Address of zhandle + compressed data(zpage) */ + unsigned long zaddr = 0; + unsigned int zlen = PAGE_SIZE; + bool zero = 0; + int ret; + + struct zcache_pool *zpool = zcache.pools[pool_id]; + + /* + * Zcache will be ineffective if the compressed memory pool is full with + * compressed inactive file pages and most of them will never be used + * again. + * So we refuse to compress pages that are not from active file list. + */ + if (!PageWasActive(page)) { + zcache_inactive_pages_refused++; + return; + } + + zero = zero_page(page); + if (zero) + goto zero; + + if (zcache_is_full()) { + zcache_pool_limit_hit++; + if (zbud_reclaim_page(zpool->pool, 8)) { + zcache_reclaim_fail++; + return; + } + /* + * Continue if reclaimed a page frame succ. 
+ */ + zcache_evict_filepages++; + zpool->size = zbud_get_pool_size(zpool->pool); + } + + /* compress */ + dst = get_cpu_var(zcache_dstmem); + src = kmap_atomic(page); + ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, src, PAGE_SIZE, dst, + &zlen); + kunmap_atomic(src); + if (ret) { + pr_err("zcache compress error ret %d\n", ret); + put_cpu_var(zcache_dstmem); + return; + } + + /* store zcache handle together with compressed page data */ + ret = zbud_alloc(zpool->pool, zlen + sizeof(struct zcache_ra_handle), + GFP_ZCACHE, &zaddr); + if (ret) { + zcache_zbud_alloc_fail++; + put_cpu_var(zcache_dstmem); + return; + } + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, zaddr); + + /* Compressed page data stored at the end of zcache_ra_handle */ + zpage = (u8 *)(zhandle + 1); + memcpy(zpage, dst, zlen); + zbud_unmap(zpool->pool, zaddr); + put_cpu_var(zcache_dstmem); + +zero: + if (zero) + zaddr = (unsigned long)ZERO_HANDLE; + + /* store zcache handle */ + ret = zcache_store_zaddr(zpool, index, key.u.ino, zaddr); + if (ret) { + zcache_store_failed++; + if (!zero) + zbud_free(zpool->pool, zaddr); + return; + } + + /* update stats */ + if (zero) { + atomic_inc(&zcache_stored_zero_pages); + } else { + zhandle->ra_index = index; + zhandle->rb_index = key.u.ino; + zhandle->zlen = zlen; + zhandle->zpool = zpool; + atomic_inc(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } + + return; +} + +static int zcache_load_page(int pool_id, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + int ret = 0; + u8 *src, *dst; + void *zaddr; + unsigned int dlen = PAGE_SIZE; + struct zcache_ra_handle *zhandle; + struct zcache_pool *zpool = zcache.pools[pool_id]; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (!zaddr) + return -ENOENT; + else if (zaddr == ZERO_HANDLE) + goto map; + + zhandle = (struct zcache_ra_handle *)zbud_map(zpool->pool, + (unsigned long)zaddr); + /* Compressed page data stored at the end of zcache_ra_handle */ + src = (u8 *)(zhandle + 1); + + /* decompress */ +map: + dst = kmap_atomic(page); + if (zaddr != ZERO_HANDLE) { + ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, src, + zhandle->zlen, dst, &dlen); + } else { + memset(dst, 0, PAGE_SIZE); + kunmap_atomic(dst); + flush_dcache_page(page); + atomic_dec(&zcache_stored_zero_pages); + goto out; + } + kunmap_atomic(dst); + zbud_unmap(zpool->pool, (unsigned long)zaddr); + zbud_free(zpool->pool, (unsigned long)zaddr); + + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* update stats */ + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); +out: + SetPageWasActive(page); + return ret; +} + +static void zcache_flush_page(int pool_id, struct cleancache_filekey key, + pgoff_t index) +{ + struct zcache_pool *zpool = zcache.pools[pool_id]; + void *zaddr = NULL; + + zaddr = zcache_load_delete_zaddr(zpool, key.u.ino, index); + if (zaddr && (zaddr != ZERO_HANDLE)) { + zbud_free(zpool->pool, (unsigned long)zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(zpool->pool); + } else if (zaddr == ZERO_HANDLE) { + atomic_dec(&zcache_stored_zero_pages); + } +} + +#define FREE_BATCH 16 +/* + * Callers must hold the lock + */ +static void zcache_flush_ratree(struct zcache_pool *zpool, + struct zcache_rbnode *rbnode) +{ + unsigned long index = 0; + int count, i; + struct zcache_ra_handle *zhandle; + void *zaddr = NULL; + + do { + void *zaddrs[FREE_BATCH]; + unsigned long indices[FREE_BATCH]; + + count = 
radix_tree_gang_lookup_index(&rbnode->ratree,
+				(void **)zaddrs, indices,
+				index, FREE_BATCH);
+
+		for (i = 0; i < count; i++) {
+			if (zaddrs[i] == ZERO_HANDLE) {
+				zaddr = radix_tree_delete(&rbnode->ratree,
+						indices[i]);
+				if (zaddr)
+					atomic_dec(&zcache_stored_zero_pages);
+				continue;
+			}
+			zhandle = (struct zcache_ra_handle *)zbud_map(
+					zpool->pool, (unsigned long)zaddrs[i]);
+			index = zhandle->ra_index;
+			zaddr = radix_tree_delete(&rbnode->ratree, index);
+			if (!zaddr)
+				continue;
+			zbud_unmap(zpool->pool, (unsigned long)zaddrs[i]);
+			zbud_free(zpool->pool, (unsigned long)zaddrs[i]);
+			atomic_dec(&zcache_stored_pages);
+			zpool->size = zbud_get_pool_size(zpool->pool);
+		}
+
+		index++;
+	} while (count == FREE_BATCH);
+}
+
+static void zcache_flush_inode(int pool_id, struct cleancache_filekey key)
+{
+	struct zcache_rbnode *rbnode;
+	unsigned long flags1, flags2;
+	struct zcache_pool *zpool = zcache.pools[pool_id];
+
+	/*
+	 * Refuse new pages being added to the same rbnode, so take rb_lock
+	 * first.
+	 */
+	write_lock_irqsave(&zpool->rb_lock, flags1);
+	rbnode = zcache_find_rbnode(&zpool->rbtree, key.u.ino, 0, 0);
+	if (!rbnode) {
+		write_unlock_irqrestore(&zpool->rb_lock, flags1);
+		return;
+	}
+
+	kref_get(&rbnode->refcount);
+	spin_lock_irqsave(&rbnode->ra_lock, flags2);
+
+	zcache_flush_ratree(zpool, rbnode);
+	if (zcache_rbnode_empty(rbnode))
+		/* When we arrive here, we already hold rb_lock */
+		zcache_rbnode_isolate(zpool, rbnode, 1);
+
+	spin_unlock_irqrestore(&rbnode->ra_lock, flags2);
+	write_unlock_irqrestore(&zpool->rb_lock, flags1);
+	kref_put(&rbnode->refcount, zcache_rbnode_release);
+}
+
+static void zcache_destroy_pool(struct zcache_pool *zpool);
+static void zcache_flush_fs(int pool_id)
+{
+	struct zcache_rbnode *z_rbnode = NULL;
+	struct rb_node *rbnode;
+	unsigned long flags1, flags2;
+	struct zcache_pool *zpool;
+
+	if (pool_id < 0)
+		return;
+
+	zpool = zcache.pools[pool_id];
+	if (!zpool)
+		return;
+
+	/*
+	 * Refuse new pages being added, so take rb_lock first.
+	 */
+	write_lock_irqsave(&zpool->rb_lock, flags1);
+
+	rbnode = rb_first(&zpool->rbtree);
+	while (rbnode) {
+		z_rbnode = rb_entry(rbnode, struct zcache_rbnode, rb_node);
+		rbnode = rb_next(rbnode);
+		if (z_rbnode) {
+			kref_get(&z_rbnode->refcount);
+			spin_lock_irqsave(&z_rbnode->ra_lock, flags2);
+			zcache_flush_ratree(zpool, z_rbnode);
+			if (zcache_rbnode_empty(z_rbnode))
+				zcache_rbnode_isolate(zpool, z_rbnode, 1);
+			spin_unlock_irqrestore(&z_rbnode->ra_lock, flags2);
+			kref_put(&z_rbnode->refcount, zcache_rbnode_release);
+		}
+	}
+
+	write_unlock_irqrestore(&zpool->rb_lock, flags1);
+	zcache_destroy_pool(zpool);
+}
+
+/*
+ * Evict compressed pages from the zcache pool on an LRU basis once the
+ * compressed pool is full.
+ */ +static int zcache_evict_zpage(struct zbud_pool *pool, unsigned long zaddr) +{ + struct zcache_pool *zpool; + struct zcache_ra_handle *zhandle; + void *zaddr_intree; + + BUG_ON(zaddr == (unsigned long)ZERO_HANDLE); + + zhandle = (struct zcache_ra_handle *)zbud_map(pool, zaddr); + + zpool = zhandle->zpool; + /* There can be a race with zcache store */ + if (!zpool) + return -EINVAL; + + BUG_ON(pool != zpool->pool); + + zaddr_intree = zcache_load_delete_zaddr(zpool, zhandle->rb_index, + zhandle->ra_index); + if (zaddr_intree) { + BUG_ON((unsigned long)zaddr_intree != zaddr); + zbud_unmap(pool, zaddr); + zbud_free(pool, zaddr); + atomic_dec(&zcache_stored_pages); + zpool->size = zbud_get_pool_size(pool); + zcache_evict_zpages++; + } + return 0; +} + +static struct zbud_ops zcache_zbud_ops = { + .evict = zcache_evict_zpage +}; + +/* Return pool id */ +static int zcache_create_pool(void) +{ + int ret; + struct zcache_pool *zpool; + + zpool = kzalloc(sizeof(*zpool), GFP_KERNEL); + if (!zpool) { + ret = -ENOMEM; + goto out; + } + + zpool->pool = zbud_create_pool(GFP_KERNEL, &zcache_zbud_ops); + if (!zpool->pool) { + kfree(zpool); + ret = -ENOMEM; + goto out; + } + + spin_lock(&zcache.pool_lock); + if (zcache.num_pools == MAX_ZCACHE_POOLS) { + pr_err("Cannot create new pool (limit:%u)\n", MAX_ZCACHE_POOLS); + zbud_destroy_pool(zpool->pool); + kfree(zpool); + ret = -EPERM; + goto out_unlock; + } + + rwlock_init(&zpool->rb_lock); + zpool->rbtree = RB_ROOT; + /* Add to pool list */ + for (ret = 0; ret < MAX_ZCACHE_POOLS; ret++) + if (!zcache.pools[ret]) + break; + zcache.pools[ret] = zpool; + zcache.num_pools++; + pr_info("New pool created id:%d\n", ret); + +out_unlock: + spin_unlock(&zcache.pool_lock); +out: + return ret; +} + +static void zcache_destroy_pool(struct zcache_pool *zpool) +{ + int i; + + if (!zpool) + return; + + spin_lock(&zcache.pool_lock); + zcache.num_pools--; + for (i = 0; i < MAX_ZCACHE_POOLS; i++) + if (zcache.pools[i] == zpool) + break; + zcache.pools[i] = NULL; + spin_unlock(&zcache.pool_lock); + + if (!RB_EMPTY_ROOT(&zpool->rbtree)) + WARN_ON("Memory leak detected. 
Freeing non-empty pool!\n"); + + zbud_destroy_pool(zpool->pool); + kfree(zpool); +} + +static int zcache_init_fs(size_t pagesize) +{ + int ret; + + if (pagesize != PAGE_SIZE) { + pr_info("Unsupported page size: %zu", pagesize); + ret = -EINVAL; + goto out; + } + + ret = zcache_create_pool(); + if (ret < 0) { + pr_info("Failed to create new pool\n"); + ret = -ENOMEM; + goto out; + } +out: + return ret; +} + +static int zcache_init_shared_fs(char *uuid, size_t pagesize) +{ + /* shared pools are unsupported and map to private */ + return zcache_init_fs(pagesize); +} + +static struct cleancache_ops zcache_ops = { + .put_page = zcache_store_page, + .get_page = zcache_load_page, + .invalidate_page = zcache_flush_page, + .invalidate_inode = zcache_flush_inode, + .invalidate_fs = zcache_flush_fs, + .init_shared_fs = zcache_init_shared_fs, + .init_fs = zcache_init_fs +}; + +/* + * Debugfs functions + */ +#ifdef CONFIG_DEBUG_FS +#include + +static int pool_pages_get(void *_data, u64 *val) +{ + *val = zcache_pages(); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(pool_page_fops, pool_pages_get, NULL, "%llu\n"); + +static struct dentry *zcache_debugfs_root; + +static int __init zcache_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zcache_debugfs_root = debugfs_create_dir("zcache", NULL); + if (!zcache_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, zcache_debugfs_root, + &zcache_pool_limit_hit); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, zcache_debugfs_root, + &zcache_zbud_alloc_fail); + debugfs_create_u64("duplicate_entry", S_IRUGO, zcache_debugfs_root, + &zcache_dup_entry); + debugfs_create_file("pool_pages", S_IRUGO, zcache_debugfs_root, NULL, + &pool_page_fops); + debugfs_create_atomic_t("stored_pages", S_IRUGO, zcache_debugfs_root, + &zcache_stored_pages); + debugfs_create_atomic_t("stored_zero_pages", S_IRUGO, + zcache_debugfs_root, &zcache_stored_zero_pages); + debugfs_create_u64("evicted_zpages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_zpages); + debugfs_create_u64("evicted_filepages", S_IRUGO, zcache_debugfs_root, + &zcache_evict_filepages); + debugfs_create_u64("reclaim_fail", S_IRUGO, zcache_debugfs_root, + &zcache_reclaim_fail); + debugfs_create_u64("inactive_pages_refused", S_IRUGO, + zcache_debugfs_root, &zcache_inactive_pages_refused); + debugfs_create_u64("pool_shrink_count", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink); + debugfs_create_u64("pool_shrink_fail", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_fail); + debugfs_create_u64("pool_shrink_pages", S_IRUGO, + zcache_debugfs_root, &zcache_pool_shrink_pages); + debugfs_create_u64("store_fail", S_IRUGO, + zcache_debugfs_root, &zcache_store_failed); + return 0; +} + +static void __exit zcache_debugfs_exit(void) +{ + debugfs_remove_recursive(zcache_debugfs_root); +} +#else +static int __init zcache_debugfs_init(void) +{ + return 0; +} +static void __exit zcache_debugfs_exit(void) +{ +} +#endif + +/* + * zcache init and exit + */ +static int __init init_zcache(void) +{ + if (!zcache_enabled) + return 0; + + pr_info("loading zcache..\n"); + if (zcache_rbnode_cache_create()) { + pr_err("entry cache creation failed\n"); + goto error; + } + + if (zcache_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zcache_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + + spin_lock_init(&zcache.pool_lock); + cleancache_register_ops(&zcache_ops); + + if (zcache_debugfs_init()) + pr_warn("debugfs 
initialization failed\n"); + register_shrinker(&zcache_shrinker); + return 0; +pcpufail: + zcache_comp_exit(); +compfail: + zcache_rbnode_cache_destroy(); +error: + return -ENOMEM; +} + +/* must be late so crypto has time to come up */ +late_initcall(init_zcache); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bob Liu "); +MODULE_DESCRIPTION("Compressed cache for clean file pages"); + From 0330c0719ce95f645e0a97f25ece38418e5a830c Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 07:09:21 -0400 Subject: [PATCH 41/52] ZCACHE: add zcache --- include/linux/page-flags.h | 6 ++++++ mm/vmscan.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 837539e4d..1bd8e64f3 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -224,6 +224,12 @@ PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) +#ifdef CONFIG_ZCACHE +PAGEFLAG(WasActive, was_active) +#else +PAGEFLAG_FALSE(WasActive) +#endif + /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. diff --git a/mm/vmscan.c b/mm/vmscan.c index dd2681158..b27d8ba49 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1598,6 +1598,12 @@ static void shrink_active_list(unsigned long nr_to_scan, } ClearPageActive(page); /* we are de-activating */ + if (IS_ENABLED(CONFIG_ZCACHE)) + /* + * For zcache to know whether the page is from active + * file list + */ + SetPageWasActive(page); list_add(&page->lru, &l_inactive); } From 10d5a49b2cfbb738973891475edd62ebf13a9d67 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 07:24:22 -0400 Subject: [PATCH 42/52] rbtree: update to 3.10 --- drivers/gpu/ion/ion.c | 2 +- include/linux/rbtree.h | 136 ++---- include/linux/rbtree_augmented.h | 249 +++++++++++ include/linux/timerqueue.h | 2 +- lib/rbtree.c | 698 +++++++++++++++++-------------- 5 files changed, 672 insertions(+), 415 deletions(-) create mode 100644 include/linux/rbtree_augmented.h diff --git a/drivers/gpu/ion/ion.c b/drivers/gpu/ion/ion.c index 9c61bad7e..34d20c949 100644 --- a/drivers/gpu/ion/ion.c +++ b/drivers/gpu/ion/ion.c @@ -325,7 +325,7 @@ static struct ion_handle *ion_handle_create(struct ion_client *client, if (!handle) return ERR_PTR(-ENOMEM); kref_init(&handle->ref); - rb_init_node(&handle->node); + RB_CLEAR_NODE(&handle->node); handle->client = client; ion_buffer_get(buffer); ion_buffer_add_to_handle(buffer); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 99ce4aedd..a5aa7ae67 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -23,72 +23,7 @@ I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... - Some example of insert and search follows here. The search is a plain - normal search over an ordered tree. The insert instead must be implemented - in two steps: First, the code must insert the element in order as a red leaf - in the tree, and then the support library function rb_insert_color() must - be called. Such function will do the not trivial work to rebalance the - rbtree, if necessary. 
- ------------------------------------------------------------------------ -static inline struct page * rb_search_page_cache(struct inode * inode, - unsigned long offset) -{ - struct rb_node * n = inode->i_rb_page_cache.rb_node; - struct page * page; - - while (n) - { - page = rb_entry(n, struct page, rb_page_cache); - - if (offset < page->offset) - n = n->rb_left; - else if (offset > page->offset) - n = n->rb_right; - else - return page; - } - return NULL; -} - -static inline struct page * __rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct rb_node ** p = &inode->i_rb_page_cache.rb_node; - struct rb_node * parent = NULL; - struct page * page; - - while (*p) - { - parent = *p; - page = rb_entry(parent, struct page, rb_page_cache); - - if (offset < page->offset) - p = &(*p)->rb_left; - else if (offset > page->offset) - p = &(*p)->rb_right; - else - return page; - } - - rb_link_node(node, parent, p); - - return NULL; -} - -static inline struct page * rb_insert_page_cache(struct inode * inode, - unsigned long offset, - struct rb_node * node) -{ - struct page * ret; - if ((ret = __rb_insert_page_cache(inode, offset, node))) - goto out; - rb_insert_color(node, &inode->i_rb_page_cache); - out: - return ret; -} ------------------------------------------------------------------------ + See Documentation/rbtree.txt for documentation and samples. */ #ifndef _LINUX_RBTREE_H @@ -96,64 +31,37 @@ static inline struct page * rb_insert_page_cache(struct inode * inode, #include #include +#include -struct rb_node -{ - unsigned long rb_parent_color; -#define RB_RED 0 -#define RB_BLACK 1 +struct rb_node { + unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ -struct rb_root -{ +struct rb_root { struct rb_node *rb_node; }; -#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3)) -#define rb_color(r) ((r)->rb_parent_color & 1) -#define rb_is_red(r) (!rb_color(r)) -#define rb_is_black(r) rb_color(r) -#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0) -#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0) - -static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) -{ - rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; -} -static inline void rb_set_color(struct rb_node *rb, int color) -{ - rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; -} +#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) -#define RB_EMPTY_NODE(node) (rb_parent(node) == node) -#define RB_CLEAR_NODE(node) (rb_set_parent(node, node)) +#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) + +/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ +#define RB_EMPTY_NODE(node) \ + ((node)->__rb_parent_color == (unsigned long)(node)) +#define RB_CLEAR_NODE(node) \ + ((node)->__rb_parent_color = (unsigned long)(node)) -static inline void rb_init_node(struct rb_node *rb) -{ - rb->rb_parent_color = 0; - rb->rb_right = NULL; - rb->rb_left = NULL; - RB_CLEAR_NODE(rb); -} extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); -typedef void (*rb_augment_f)(struct rb_node *node, void *data); - -extern void 
rb_augment_insert(struct rb_node *node, - rb_augment_f func, void *data); -extern struct rb_node *rb_augment_erase_begin(struct rb_node *node); -extern void rb_augment_erase_end(struct rb_node *node, - rb_augment_f func, void *data); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -166,18 +74,27 @@ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ -extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, - struct rb_node ** rb_link) +static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) { - node->rb_parent_color = (unsigned long )parent; + node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } +static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + node->__rb_parent_color = (unsigned long)parent; + node->rb_left = node->rb_right = NULL; + + rcu_assign_pointer(*rb_link, node); +} + #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ @@ -205,4 +122,5 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h new file mode 100644 index 000000000..14d7b831b --- /dev/null +++ b/include/linux/rbtree_augmented.h @@ -0,0 +1,249 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + (C) 2002 David Woodhouse + (C) 2012 Michel Lespinasse + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree_augmented.h +*/ + +#ifndef _LINUX_RBTREE_AUGMENTED_H +#define _LINUX_RBTREE_AUGMENTED_H + +#include +#include + +/* + * Please note - only struct rb_augment_callbacks and the prototypes for + * rb_insert_augmented() and rb_erase_augmented() are intended to be public. + * The rest are implementation details you are not expected to depend on. + * + * See Documentation/rbtree.txt for documentation and samples. 
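As an editor's aside (not part of this patch; my_node, subtree_max, my_compute_max and my_cb are made-up names), a minimal sketch of how these callbacks are typically declared and used for a "maximum value in subtree" augmentation, assuming <linux/rbtree_augmented.h> is included:

	struct my_node {
		struct rb_node rb;
		unsigned long value;
		unsigned long subtree_max;	/* max of 'value' in this subtree */
	};

	/* recompute the augmented value of one node from itself and its children */
	static unsigned long my_compute_max(struct my_node *node)
	{
		unsigned long m = node->value;
		struct my_node *c;

		if (node->rb.rb_left) {
			c = rb_entry(node->rb.rb_left, struct my_node, rb);
			if (c->subtree_max > m)
				m = c->subtree_max;
		}
		if (node->rb.rb_right) {
			c = rb_entry(node->rb.rb_right, struct my_node, rb);
			if (c->subtree_max > m)
				m = c->subtree_max;
		}
		return m;
	}

	/* generates my_cb_propagate/_copy/_rotate and the my_cb callback struct */
	RB_DECLARE_CALLBACKS(static, my_cb, struct my_node, rb,
			     unsigned long, subtree_max, my_compute_max)

	static void my_insert_augmented(struct rb_root *root, struct my_node *new)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;

		new->subtree_max = new->value;
		while (*link) {
			struct my_node *cur = rb_entry(*link, struct my_node, rb);

			parent = *link;
			/* keep augmented data on the descent path up to date */
			if (cur->subtree_max < new->value)
				cur->subtree_max = new->value;
			if (new->value < cur->value)
				link = &(*link)->rb_left;
			else
				link = &(*link)->rb_right;
		}

		rb_link_node(&new->rb, parent, link);
		rb_insert_augmented(&new->rb, root, &my_cb);
	}

Removal would symmetrically go through rb_erase_augmented(&node->rb, root, &my_cb).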
+ */ + +struct rb_augment_callbacks { + void (*propagate)(struct rb_node *node, struct rb_node *stop); + void (*copy)(struct rb_node *old, struct rb_node *new); + void (*rotate)(struct rb_node *old, struct rb_node *new); +}; + +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); +/* + * Fixup the rbtree and update the augmented information when rebalancing. + * + * On insertion, the user must update the augmented information on the path + * leading to the inserted node, then call rb_link_node() as usual and + * rb_augment_inserted() instead of the usual rb_insert_color() call. + * If rb_augment_inserted() rebalances the rbtree, it will callback into + * a user provided function to update the augmented information on the + * affected subtrees. + */ +static inline void +rb_insert_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, root, augment->rotate); +} + +#define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ + rbtype, rbaugmented, rbcompute) \ +static inline void \ +rbname ## _propagate(struct rb_node *rb, struct rb_node *stop) \ +{ \ + while (rb != stop) { \ + rbstruct *node = rb_entry(rb, rbstruct, rbfield); \ + rbtype augmented = rbcompute(node); \ + if (node->rbaugmented == augmented) \ + break; \ + node->rbaugmented = augmented; \ + rb = rb_parent(&node->rbfield); \ + } \ +} \ +static inline void \ +rbname ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ +} \ +static void \ +rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ +{ \ + rbstruct *old = rb_entry(rb_old, rbstruct, rbfield); \ + rbstruct *new = rb_entry(rb_new, rbstruct, rbfield); \ + new->rbaugmented = old->rbaugmented; \ + old->rbaugmented = rbcompute(old); \ +} \ +rbstatic const struct rb_augment_callbacks rbname = { \ + rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ +}; + + +#define RB_RED 0 +#define RB_BLACK 1 + +#define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) + +#define __rb_color(pc) ((pc) & 1) +#define __rb_is_black(pc) __rb_color(pc) +#define __rb_is_red(pc) (!__rb_color(pc)) +#define rb_color(rb) __rb_color((rb)->__rb_parent_color) +#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) +#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ + rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; +} + +static inline void rb_set_parent_color(struct rb_node *rb, + struct rb_node *p, int color) +{ + rb->__rb_parent_color = (unsigned long)p | color; +} + +static inline void +__rb_change_child(struct rb_node *old, struct rb_node *new, + struct rb_node *parent, struct rb_root *root) +{ + if (parent) { + if (parent->rb_left == old) + WRITE_ONCE(parent->rb_left, new); + else + WRITE_ONCE(parent->rb_right, new); + } else + WRITE_ONCE(root->rb_node, new); +} + +extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + +static __always_inline struct rb_node * +__rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *child = node->rb_right; + struct rb_node *tmp = node->rb_left; + 
struct rb_node *parent, *rebalance; + unsigned long pc; + + if (!tmp) { + /* + * Case 1: node to erase has no more than 1 child (easy!) + * + * Note that if there is one child it must be red due to 5) + * and node must be black due to 4). We adjust colors locally + * so as to bypass __rb_erase_color() later on. + */ + pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, child, parent, root); + if (child) { + child->__rb_parent_color = pc; + rebalance = NULL; + } else + rebalance = __rb_is_black(pc) ? parent : NULL; + tmp = parent; + } else if (!child) { + /* Still case 1, but this time the child is node->rb_left */ + tmp->__rb_parent_color = pc = node->__rb_parent_color; + parent = __rb_parent(pc); + __rb_change_child(node, tmp, parent, root); + rebalance = NULL; + tmp = parent; + } else { + struct rb_node *successor = child, *child2; + + tmp = child->rb_left; + if (!tmp) { + /* + * Case 2: node's successor is its right child + * + * (n) (s) + * / \ / \ + * (x) (s) -> (x) (c) + * \ + * (c) + */ + parent = successor; + child2 = successor->rb_right; + + augment->copy(node, successor); + } else { + /* + * Case 3: node's successor is leftmost under + * node's right child subtree + * + * (n) (s) + * / \ / \ + * (x) (y) -> (x) (y) + * / / + * (p) (p) + * / / + * (s) (c) + * \ + * (c) + */ + do { + parent = successor; + successor = tmp; + tmp = tmp->rb_left; + } while (tmp); + child2 = successor->rb_right; + WRITE_ONCE(parent->rb_left, child2); + WRITE_ONCE(successor->rb_right, child); + rb_set_parent(child, successor); + + augment->copy(node, successor); + augment->propagate(parent, successor); + } + + tmp = node->rb_left; + WRITE_ONCE(successor->rb_left, tmp); + rb_set_parent(tmp, successor); + + pc = node->__rb_parent_color; + tmp = __rb_parent(pc); + __rb_change_child(node, successor, tmp, root); + + if (child2) { + successor->__rb_parent_color = pc; + rb_set_parent_color(child2, parent, RB_BLACK); + rebalance = NULL; + } else { + unsigned long pc2 = successor->__rb_parent_color; + successor->__rb_parent_color = pc; + rebalance = __rb_is_black(pc2) ? 
parent : NULL; + } + tmp = successor; + } + + augment->propagate(tmp, NULL); + return rebalance; +} + +static __always_inline void +rb_erase_augmented(struct rb_node *node, struct rb_root *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + if (rebalance) + __rb_erase_color(rebalance, root, augment->rotate); +} + +#endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index 508872747..a520fd70a 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -39,7 +39,7 @@ struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) static inline void timerqueue_init(struct timerqueue_node *node) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); } static inline void timerqueue_init_head(struct timerqueue_head *head) diff --git a/lib/rbtree.c b/lib/rbtree.c index 68d1faf12..1356454e3 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -2,7 +2,8 @@ Red Black Trees (C) 1999 Andrea Arcangeli (C) 2002 David Woodhouse - + (C) 2012 Michel Lespinasse + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -20,339 +21,428 @@ linux/lib/rbtree.c */ -#include +#include #include -static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) -{ - struct rb_node *right = node->rb_right; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_right = right->rb_left)) - rb_set_parent(right->rb_left, node); - right->rb_left = node; +/* + * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree + * + * 1) A node is either red or black + * 2) The root is black + * 3) All leaves (NULL) are black + * 4) Both children of every red node are black + * 5) Every simple path from root to leaves contains the same number + * of black nodes. + * + * 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two + * consecutive red nodes in a path and every red node is therefore followed by + * a black. So if B is the number of black nodes on every simple path (as per + * 5), then the longest possible path due to 4 is 2B. + * + * We shall indicate color with case, where black nodes are uppercase and red + * nodes will be lowercase. Unknown color nodes shall be drawn as red within + * parentheses and have some accompanying text comment. + */ - rb_set_parent(right, parent); +/* + * Notes on lockless lookups: + * + * All stores to the tree structure (rb_left and rb_right) must be done using + * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the + * tree structure as seen in program order. + * + * These two requirements will allow lockless iteration of the tree -- not + * correct iteration mind you, tree rotations are not atomic so a lookup might + * miss entire subtrees. + * + * But they do guarantee that any such traversal will only see valid elements + * and that it will indeed complete -- does not get stuck in a loop. + * + * It also guarantees that if the lookup returns an element it is the 'correct' + * one. But not returning an element does _NOT_ mean it's not present. + * + * NOTE: + * + * Stores to __rb_parent_color are not important for simple lookups so those + * are left undone as of now. Nor did I check for loops involving parent + * pointers. 
+ */ - if (parent) - { - if (node == parent->rb_left) - parent->rb_left = right; - else - parent->rb_right = right; - } - else - root->rb_node = right; - rb_set_parent(node, right); +static inline void rb_set_black(struct rb_node *rb) +{ + rb->__rb_parent_color |= RB_BLACK; } -static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +static inline struct rb_node *rb_red_parent(struct rb_node *red) { - struct rb_node *left = node->rb_left; - struct rb_node *parent = rb_parent(node); - - if ((node->rb_left = left->rb_right)) - rb_set_parent(left->rb_right, node); - left->rb_right = node; - - rb_set_parent(left, parent); + return (struct rb_node *)red->__rb_parent_color; +} - if (parent) - { - if (node == parent->rb_right) - parent->rb_right = left; - else - parent->rb_left = left; - } - else - root->rb_node = left; - rb_set_parent(node, left); +/* + * Helper function for rotations: + * - old's parent and color get assigned to new + * - old gets assigned new as a parent and 'color' as a color. + */ +static inline void +__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, + struct rb_root *root, int color) +{ + struct rb_node *parent = rb_parent(old); + new->__rb_parent_color = old->__rb_parent_color; + rb_set_parent_color(old, new, color); + __rb_change_child(old, new, parent, root); } -void rb_insert_color(struct rb_node *node, struct rb_root *root) +static __always_inline void +__rb_insert(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *parent, *gparent; - - while ((parent = rb_parent(node)) && rb_is_red(parent)) - { - gparent = rb_parent(parent); - - if (parent == gparent->rb_left) - { - { - register struct rb_node *uncle = gparent->rb_right; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + + while (true) { + /* + * Loop invariant: node is red + * + * If there is a black parent, we are done. + * Otherwise, take some corrective action as we don't + * want a red root or two consecutive red nodes. + */ + if (!parent) { + rb_set_parent_color(node, NULL, RB_BLACK); + break; + } else if (rb_is_black(parent)) + break; + + gparent = rb_red_parent(parent); + + tmp = gparent->rb_right; + if (parent != tmp) { /* parent == gparent->rb_left */ + if (tmp && rb_is_red(tmp)) { + /* + * Case 1 - color flips + * + * G g + * / \ / \ + * p u --> P U + * / / + * n n + * + * However, since g's parent might be red, and + * 4) does not allow this, we need to recurse + * at g. + */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } - if (parent->rb_right == node) - { - register struct rb_node *tmp; - __rb_rotate_left(parent, root); - tmp = parent; + tmp = parent->rb_right; + if (node == tmp) { + /* + * Case 2 - left rotate at parent + * + * G G + * / \ / \ + * p U --> n U + * \ / + * n p + * + * This still leaves us in violation of 4), the + * continuation into Case 3 will fix that. 
+ */ + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; - node = tmp; + tmp = node->rb_right; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_right(gparent, root); + /* + * Case 3 - right rotate at gparent + * + * G P + * / \ / \ + * p U --> n g + * / \ + * n U + */ + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; } else { - { - register struct rb_node *uncle = gparent->rb_left; - if (uncle && rb_is_red(uncle)) - { - rb_set_black(uncle); - rb_set_black(parent); - rb_set_red(gparent); - node = gparent; - continue; - } + tmp = gparent->rb_left; + if (tmp && rb_is_red(tmp)) { + /* Case 1 - color flips */ + rb_set_parent_color(tmp, gparent, RB_BLACK); + rb_set_parent_color(parent, gparent, RB_BLACK); + node = gparent; + parent = rb_parent(node); + rb_set_parent_color(node, parent, RB_RED); + continue; } - if (parent->rb_left == node) - { - register struct rb_node *tmp; - __rb_rotate_right(parent, root); - tmp = parent; + tmp = parent->rb_left; + if (node == tmp) { + /* Case 2 - right rotate at parent */ + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); + if (tmp) + rb_set_parent_color(tmp, parent, + RB_BLACK); + rb_set_parent_color(parent, node, RB_RED); + augment_rotate(parent, node); parent = node; - node = tmp; + tmp = node->rb_left; } - rb_set_black(parent); - rb_set_red(gparent); - __rb_rotate_left(gparent, root); + /* Case 3 - left rotate at gparent */ + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); + if (tmp) + rb_set_parent_color(tmp, gparent, RB_BLACK); + __rb_rotate_set_parents(gparent, parent, root, RB_RED); + augment_rotate(gparent, parent); + break; } } - - rb_set_black(root->rb_node); } -EXPORT_SYMBOL(rb_insert_color); -static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, - struct rb_root *root) +/* + * Inline version for rb_erase() use - we want to be able to inline + * and eliminate the dummy_rotate callback there + */ +static __always_inline void +____rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *other; - - while ((!node || rb_is_black(node)) && node != root->rb_node) - { - if (parent->rb_left == node) - { - other = parent->rb_right; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_left(parent, root); - other = parent->rb_right; - } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); + struct rb_node *node = NULL, *sibling, *tmp1, *tmp2; + + while (true) { + /* + * Loop invariants: + * - node is black (or NULL on first iteration) + * - node is not the root (parent is not NULL) + * - All leaf paths going through parent and node have a + * black node count that is 1 lower than other leaf paths. 
+ */ + sibling = parent->rb_right; + if (node != sibling) { /* node == parent->rb_left */ + if (rb_is_red(sibling)) { + /* + * Case 1 - left rotate at parent + * + * P S + * / \ / \ + * N s --> p Sr + * / \ / \ + * Sl Sr N Sl + */ + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; } - else - { - if (!other->rb_right || rb_is_black(other->rb_right)) - { - rb_set_black(other->rb_left); - rb_set_red(other); - __rb_rotate_right(other, root); - other = parent->rb_right; + tmp1 = sibling->rb_right; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_left; + if (!tmp2 || rb_is_black(tmp2)) { + /* + * Case 2 - sibling color flip + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N s + * / \ / \ + * Sl Sr Sl Sr + * + * This leaves us violating 5) which + * can be fixed by flipping p to black + * if it was red, or by recursing at p. + * p is red when coming from Case 1. + */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_right); - __rb_rotate_left(parent, root); - node = root->rb_node; - break; - } - } - else - { - other = parent->rb_left; - if (rb_is_red(other)) - { - rb_set_black(other); - rb_set_red(parent); - __rb_rotate_right(parent, root); - other = parent->rb_left; + /* + * Case 3 - right rotate at sibling + * (p could be either color here) + * + * (p) (p) + * / \ / \ + * N S --> N Sl + * / \ \ + * sl Sr s + * \ + * Sr + */ + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; } - if ((!other->rb_left || rb_is_black(other->rb_left)) && - (!other->rb_right || rb_is_black(other->rb_right))) - { - rb_set_red(other); - node = parent; - parent = rb_parent(node); + /* + * Case 4 - left rotate at parent + color flips + * (p and sl could be either color here. 
+ * After rotation, p becomes black, s acquires + * p's color, and sl keeps its color) + * + * (p) (s) + * / \ / \ + * N S --> P Sr + * / \ / \ + * (sl) sr N (sl) + */ + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; + } else { + sibling = parent->rb_left; + if (rb_is_red(sibling)) { + /* Case 1 - right rotate at parent */ + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, parent, RB_BLACK); + __rb_rotate_set_parents(parent, sibling, root, + RB_RED); + augment_rotate(parent, sibling); + sibling = tmp1; } - else - { - if (!other->rb_left || rb_is_black(other->rb_left)) - { - rb_set_black(other->rb_right); - rb_set_red(other); - __rb_rotate_left(other, root); - other = parent->rb_left; + tmp1 = sibling->rb_left; + if (!tmp1 || rb_is_black(tmp1)) { + tmp2 = sibling->rb_right; + if (!tmp2 || rb_is_black(tmp2)) { + /* Case 2 - sibling color flip */ + rb_set_parent_color(sibling, parent, + RB_RED); + if (rb_is_red(parent)) + rb_set_black(parent); + else { + node = parent; + parent = rb_parent(node); + if (parent) + continue; + } + break; } - rb_set_color(other, rb_color(parent)); - rb_set_black(parent); - rb_set_black(other->rb_left); - __rb_rotate_right(parent, root); - node = root->rb_node; - break; + /* Case 3 - right rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); + if (tmp1) + rb_set_parent_color(tmp1, sibling, + RB_BLACK); + augment_rotate(sibling, tmp2); + tmp1 = sibling; + sibling = tmp2; } + /* Case 4 - left rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); + rb_set_parent_color(tmp1, sibling, RB_BLACK); + if (tmp2) + rb_set_parent(tmp2, parent); + __rb_rotate_set_parents(parent, sibling, root, + RB_BLACK); + augment_rotate(parent, sibling); + break; } } - if (node) - rb_set_black(node); } -void rb_erase(struct rb_node *node, struct rb_root *root) +/* Non-inline version for rb_erase_augmented() use */ +void __rb_erase_color(struct rb_node *parent, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - struct rb_node *child, *parent; - int color; - - if (!node->rb_left) - child = node->rb_right; - else if (!node->rb_right) - child = node->rb_left; - else - { - struct rb_node *old = node, *left; - - node = node->rb_right; - while ((left = node->rb_left) != NULL) - node = left; - - if (rb_parent(old)) { - if (rb_parent(old)->rb_left == old) - rb_parent(old)->rb_left = node; - else - rb_parent(old)->rb_right = node; - } else - root->rb_node = node; - - child = node->rb_right; - parent = rb_parent(node); - color = rb_color(node); - - if (parent == old) { - parent = node; - } else { - if (child) - rb_set_parent(child, parent); - parent->rb_left = child; - - node->rb_right = old->rb_right; - rb_set_parent(old->rb_right, node); - } - - node->rb_parent_color = old->rb_parent_color; - node->rb_left = old->rb_left; - rb_set_parent(old->rb_left, node); - - goto color; - } - - parent = rb_parent(node); - color = rb_color(node); - - if (child) - rb_set_parent(child, parent); - if (parent) - { - if (parent->rb_left == node) - 
parent->rb_left = child; - else - parent->rb_right = child; - } - else - root->rb_node = child; - - color: - if (color == RB_BLACK) - __rb_erase_color(child, parent, root); + ____rb_erase_color(parent, root, augment_rotate); } -EXPORT_SYMBOL(rb_erase); +EXPORT_SYMBOL(__rb_erase_color); -static void rb_augment_path(struct rb_node *node, rb_augment_f func, void *data) -{ - struct rb_node *parent; - -up: - func(node, data); - parent = rb_parent(node); - if (!parent) - return; +/* + * Non-augmented rbtree manipulation functions. + * + * We use dummy augmented callbacks here, and have the compiler optimize them + * out of the rb_insert_color() and rb_erase() function definitions. + */ - if (node == parent->rb_left && parent->rb_right) - func(parent->rb_right, data); - else if (parent->rb_left) - func(parent->rb_left, data); +static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} +static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} - node = parent; - goto up; -} +static const struct rb_augment_callbacks dummy_callbacks = { + dummy_propagate, dummy_copy, dummy_rotate +}; -/* - * after inserting @node into the tree, update the tree to account for - * both the new entry and any damage done by rebalance - */ -void rb_augment_insert(struct rb_node *node, rb_augment_f func, void *data) +void rb_insert_color(struct rb_node *node, struct rb_root *root) { - if (node->rb_left) - node = node->rb_left; - else if (node->rb_right) - node = node->rb_right; - - rb_augment_path(node, func, data); + __rb_insert(node, root, dummy_rotate); } -EXPORT_SYMBOL(rb_augment_insert); +EXPORT_SYMBOL(rb_insert_color); -/* - * before removing the node, find the deepest node on the rebalance path - * that will still be there after @node gets removed - */ -struct rb_node *rb_augment_erase_begin(struct rb_node *node) +void rb_erase(struct rb_node *node, struct rb_root *root) { - struct rb_node *deepest; - - if (!node->rb_right && !node->rb_left) - deepest = rb_parent(node); - else if (!node->rb_right) - deepest = node->rb_left; - else if (!node->rb_left) - deepest = node->rb_right; - else { - deepest = rb_next(node); - if (deepest->rb_right) - deepest = deepest->rb_right; - else if (rb_parent(deepest) != node) - deepest = rb_parent(deepest); - } - - return deepest; + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, root, dummy_rotate); } -EXPORT_SYMBOL(rb_augment_erase_begin); +EXPORT_SYMBOL(rb_erase); /* - * after removal, update the tree to account for the removed entry - * and any rebalance damage. + * Augmented rbtree manipulation functions. + * + * This instantiates the same __always_inline functions as in the non-augmented + * case, but this time with user-defined callbacks. */ -void rb_augment_erase_end(struct rb_node *node, rb_augment_f func, void *data) + +void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - if (node) - rb_augment_path(node, func, data); + __rb_insert(node, root, augment_rotate); } -EXPORT_SYMBOL(rb_augment_erase_end); +EXPORT_SYMBOL(__rb_insert_augmented); /* * This function returns the first node (in sort order) of the tree. 
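The long usage example removed from the rbtree.h header above now lives in Documentation/rbtree.txt. Purely as an editor's illustration of the updated API (struct my_node, my_search and my_insert are hypothetical names, not part of these patches), the search/insert pattern looks like this:

	struct my_node {
		struct rb_node node;
		unsigned long key;
	};

	static struct my_node *my_search(struct rb_root *root, unsigned long key)
	{
		struct rb_node *n = root->rb_node;

		while (n) {
			struct my_node *cur = rb_entry(n, struct my_node, node);

			if (key < cur->key)
				n = n->rb_left;
			else if (key > cur->key)
				n = n->rb_right;
			else
				return cur;
		}
		return NULL;
	}

	static int my_insert(struct rb_root *root, struct my_node *new)
	{
		struct rb_node **link = &root->rb_node, *parent = NULL;

		while (*link) {
			struct my_node *cur = rb_entry(*link, struct my_node, node);

			parent = *link;
			if (new->key < cur->key)
				link = &(*link)->rb_left;
			else if (new->key > cur->key)
				link = &(*link)->rb_right;
			else
				return -EEXIST;
		}

		/* link as a red leaf, then let the core rebalance */
		rb_link_node(&new->node, parent, link);
		rb_insert_color(&new->node, root);
		return 0;
	}

Nodes are initialised with RB_CLEAR_NODE() before first use (rb_init_node() is gone, which is what the driver conversions in the surrounding patches take care of) and removed with rb_erase(&n->node, root).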
@@ -387,11 +477,13 @@ struct rb_node *rb_next(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a right-hand child, go down and then left as far - as we can. */ + /* + * If we have a right-hand child, go down and then left as far + * as we can. + */ if (node->rb_right) { node = node->rb_right; while (node->rb_left) @@ -399,12 +491,13 @@ struct rb_node *rb_next(const struct rb_node *node) return (struct rb_node *)node; } - /* No right-hand children. Everything down and left is - smaller than us, so any 'next' node must be in the general - direction of our parent. Go up the tree; any time the - ancestor is a right-hand child of its parent, keep going - up. First time it's a left-hand child of its parent, said - parent is our 'next' node. */ + /* + * No right-hand children. Everything down and left is smaller than us, + * so any 'next' node must be in the general direction of our parent. + * Go up the tree; any time the ancestor is a right-hand child of its + * parent, keep going up. First time it's a left-hand child of its + * parent, said parent is our 'next' node. + */ while ((parent = rb_parent(node)) && node == parent->rb_right) node = parent; @@ -416,11 +509,13 @@ struct rb_node *rb_prev(const struct rb_node *node) { struct rb_node *parent; - if (rb_parent(node) == node) + if (RB_EMPTY_NODE(node)) return NULL; - /* If we have a left-hand child, go down and then right as far - as we can. */ + /* + * If we have a left-hand child, go down and then right as far + * as we can. + */ if (node->rb_left) { node = node->rb_left; while (node->rb_right) @@ -428,8 +523,10 @@ struct rb_node *rb_prev(const struct rb_node *node) return (struct rb_node *)node; } - /* No left-hand children. Go up till we find an ancestor which - is a right-hand child of its parent */ + /* + * No left-hand children. Go up till we find an ancestor which + * is a right-hand child of its parent. 
+ */ while ((parent = rb_parent(node)) && node == parent->rb_left) node = parent; @@ -443,14 +540,7 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_node *parent = rb_parent(victim); /* Set the surrounding nodes to point to the replacement */ - if (parent) { - if (victim == parent->rb_left) - parent->rb_left = new; - else - parent->rb_right = new; - } else { - root->rb_node = new; - } + __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) @@ -499,4 +589,4 @@ struct rb_node *rb_first_postorder(const struct rb_root *root) return rb_left_deepest_node(root->rb_node); } -EXPORT_SYMBOL(rb_first_postorder); \ No newline at end of file +EXPORT_SYMBOL(rb_first_postorder); From bb7ccaf90ff3a1415aa924ec0fadd658c31b09f5 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 07:27:25 -0400 Subject: [PATCH 43/52] rbtree: update to 3.10 --- drivers/misc/modem_if/sipc4_modem.c | 4 ++-- drivers/misc/modem_if/sipc5_modem.c | 4 ++-- fs/proc/proc_sysctl.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/misc/modem_if/sipc4_modem.c b/drivers/misc/modem_if/sipc4_modem.c index 66ae4c1af..2ba3bac89 100755 --- a/drivers/misc/modem_if/sipc4_modem.c +++ b/drivers/misc/modem_if/sipc4_modem.c @@ -105,8 +105,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t, return NULL; } - rb_init_node(&iod->node_chan); - rb_init_node(&iod->node_fmt); + RB_CLEAR_NODE(&iod->node_chan); + RB_CLEAR_NODE(&iod->node_fmt); iod->name = io_t->name; iod->id = io_t->id; diff --git a/drivers/misc/modem_if/sipc5_modem.c b/drivers/misc/modem_if/sipc5_modem.c index 8943686fa..84043a2a7 100755 --- a/drivers/misc/modem_if/sipc5_modem.c +++ b/drivers/misc/modem_if/sipc5_modem.c @@ -108,8 +108,8 @@ static struct io_device *create_io_device(struct modem_io_t *io_t, return NULL; } - rb_init_node(&iod->node_chan); - rb_init_node(&iod->node_fmt); + RB_CLEAR_NODE(&iod->node_chan); + RB_CLEAR_NODE(&iod->node_fmt); iod->name = io_t->name; iod->id = io_t->id; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 85a854e38..0abccc488 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -170,7 +170,7 @@ static void init_header(struct ctl_table_header *head, if (node) { struct ctl_table *entry; for (entry = table; entry->procname; entry++, node++) { - rb_init_node(&node->node); + RB_CLEAR_NODE(&node->node); node->header = head; } } From c60c94e2c1985cf1c4db9b138d60f9e261d47b2c Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 21:34:33 -0400 Subject: [PATCH 44/52] radix-tree: update to 3.10 --- include/linux/pagemap.h | 5 + include/linux/radix-tree.h | 66 +++++- lib/radix-tree.c | 470 ++++++++++++++++++++++--------------- mm/filemap.c | 80 +++++++ mm/readahead.c | 4 +- 5 files changed, 423 insertions(+), 202 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 696ca3920..530b807fc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -236,6 +236,11 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) typedef int filler_t(void *, struct page *); +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan); + extern struct page * find_get_page(struct address_space *mapping, pgoff_t index); extern struct page * find_lock_page(struct address_space 
*mapping, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ffc444c38..74aee022b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -60,6 +60,33 @@ static inline int radix_tree_is_indirect_ptr(void *ptr) #define RADIX_TREE_MAX_TAGS 3 +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int height; /* Height from the bottom */ + unsigned int count; + union { + struct radix_tree_node *parent; /* Used when ascending tree */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ + RADIX_TREE_MAP_SHIFT)) + /* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */ struct radix_tree_root { unsigned int height; @@ -101,6 +128,7 @@ do { \ * concurrently with other readers. * * The notable exceptions to this rule are the following functions: + * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get @@ -216,21 +244,29 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) rcu_assign_pointer(*pslot, item); } +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node); +void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +unsigned int +radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results, + unsigned long *indices, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan); int radix_tree_preload(gfp_t gfp_mask); +int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag); @@ -321,13 +357,29 @@ radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) void **radix_tree_next_chunk(struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags); +/** + * radix_tree_iter_retry - retry this chunk of the iteration + * @iter: iterator 
state + * + * If we iterate over a tree protected only by the RCU lock, a race + * against deletion or creation may result in seeing a slot for which + * radix_tree_deref_retry() returns true. If so, call this function + * and continue the iteration. + */ +static inline __must_check +void **radix_tree_iter_retry(struct radix_tree_iter *iter) +{ + iter->next_index = iter->index; + return NULL; +} + /** * radix_tree_chunk_size - get current chunk size * * @iter: pointer to radix tree iterator * Returns: current chunk size */ -static __always_inline unsigned +static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; @@ -361,9 +413,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) return slot + offset + 1; } } else { - unsigned size = radix_tree_chunk_size(iter) - 1; + long size = radix_tree_chunk_size(iter); - while (size--) { + while (--size > 0) { slot++; iter->index++; if (likely(*slot)) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3ac50dc55..6ebbdba6b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -32,35 +32,9 @@ #include #include #include +#include /* in_interrupt() */ -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int height; /* Height from the bottom */ - unsigned int count; - union { - struct radix_tree_node *parent; /* Used when ascending tree */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - -#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ - RADIX_TREE_MAP_SHIFT)) - /* * The height_to_maxindex array needs to be one deeper than the maximum * path as height 0 holds only 1 entry. @@ -72,12 +46,25 @@ static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly; */ static struct kmem_cache *radix_tree_node_cachep; +/* + * The radix tree is variable-height, so an insert operation not only has + * to build the branch to its corresponding item, it also has to build the + * branch to existing items if the size has to be increased (by + * radix_tree_extend). + * + * The worst case is a zero height tree with just a single item at index 0, + * and then inserting an item at index ULONG_MAX. This requires 2 new branches + * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. + * Hence: + */ +#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) + /* * Per-cpu pool of preloaded nodes */ struct radix_tree_preload { int nr; - struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH]; + struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE]; }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; @@ -194,7 +181,12 @@ radix_tree_node_alloc(struct radix_tree_root *root) struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); - if (!(gfp_mask & __GFP_WAIT)) { + /* + * Preload code isn't irq safe and it doesn't make sence to use + * preloading in the interrupt anyway as all the allocations have to + * be atomic. 
So just do normal allocation when in interrupt. + */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -202,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; @@ -251,21 +243,21 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < ARRAY_SIZE(rtp->nodes)) rtp->nodes[rtp->nr++] = node; else @@ -275,8 +267,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. @@ -337,23 +360,28 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) } /** - * radix_tree_insert - insert into a radix tree + * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key - * @item: item to insert + * @nodep: returns node + * @slotp: returns slot * - * Insert an item into the radix tree at position @index. + * Create, if necessary, and return the node and slot for an item + * at position @index in the radix tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. + * + * Returns -ENOMEM, or 0 for success. 
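As an aside on how callers are expected to use the preload API above: radix_tree_preload() fills the per-CPU node pool (on a typical 64-bit build with RADIX_TREE_MAP_SHIFT == 6, RADIX_TREE_MAX_PATH is DIV_ROUND_UP(64, 6) = 11, so the pool holds 2 * 11 - 1 = 21 nodes) while sleeping is still allowed, and returns with preemption disabled so the pool cannot be drained by another task before the insertion happens. A minimal sketch of that pattern follows; the tree, lock and function names are illustrative, not part of this patch.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>

static RADIX_TREE(my_tree, GFP_ATOMIC);         /* hypothetical tree */
static DEFINE_SPINLOCK(my_lock);                /* hypothetical lock */

static int my_store(unsigned long index, void *item)
{
        int err;

        /* Allocate worst-case nodes up front, while we may still sleep. */
        err = radix_tree_preload(GFP_KERNEL);
        if (err)
                return err;

        spin_lock(&my_lock);
        /* Cannot fail with -ENOMEM here: nodes come from the preload pool. */
        err = radix_tree_insert(&my_tree, index, item);
        spin_unlock(&my_lock);

        radix_tree_preload_end();       /* re-enable preemption */
        return err;
}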
*/ -int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *item) +int __radix_tree_create(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { struct radix_tree_node *node = NULL, *slot; - unsigned int height, shift; - int offset; + unsigned int height, shift, offset; int error; - BUG_ON(radix_tree_is_indirect_ptr(item)); - /* Make sure the tree is high enough. */ if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); @@ -389,16 +417,42 @@ int radix_tree_insert(struct radix_tree_root *root, height--; } - if (slot != NULL) + if (nodep) + *nodep = node; + if (slotp) + *slotp = node ? node->slots + offset : (void **)&root->rnode; + return 0; +} + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_indirect_ptr(item)); + + error = __radix_tree_create(root, index, &node, &slot); + if (error) + return error; + if (*slot != NULL) return -EEXIST; + rcu_assign_pointer(*slot, item); if (node) { node->count++; - rcu_assign_pointer(node->slots[offset], item); - BUG_ON(tag_get(node, 0, offset)); - BUG_ON(tag_get(node, 1, offset)); + BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK)); + BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK)); } else { - rcu_assign_pointer(root->rnode, item); BUG_ON(root_tag_get(root, 0)); BUG_ON(root_tag_get(root, 1)); } @@ -407,15 +461,26 @@ int radix_tree_insert(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_insert); -/* - * is_slot == 1 : search for the slot. - * is_slot == 0 : search for the node. +/** + * __radix_tree_lookup - lookup an item in a radix tree + * @root: radix tree root + * @index: index key + * @nodep: returns node + * @slotp: returns slot + * + * Lookup and return the item at position @index in the radix + * tree @root. + * + * Until there is more than one item in the tree, no nodes are + * allocated and @root->rnode is used as a direct slot instead of + * pointing to a node, in which case *@nodep will be NULL. */ -static void *radix_tree_lookup_element(struct radix_tree_root *root, - unsigned long index, int is_slot) +void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node **nodep, void ***slotp) { + struct radix_tree_node *node, *parent; unsigned int height, shift; - struct radix_tree_node *node, **slot; + void **slot; node = rcu_dereference_raw(root->rnode); if (node == NULL) @@ -424,7 +489,12 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(node)) { if (index > 0) return NULL; - return is_slot ? 
(void *)&root->rnode : node; + + if (nodep) + *nodep = NULL; + if (slotp) + *slotp = (void **)&root->rnode; + return node; } node = indirect_to_ptr(node); @@ -435,8 +505,8 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, shift = (height-1) * RADIX_TREE_MAP_SHIFT; do { - slot = (struct radix_tree_node **) - (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + parent = node; + slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; @@ -445,7 +515,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot : indirect_to_ptr(node); + if (nodep) + *nodep = parent; + if (slotp) + *slotp = slot; + return node; } /** @@ -463,7 +537,11 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, */ void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index) { - return (void **)radix_tree_lookup_element(root, index, 1); + void **slot; + + if (!__radix_tree_lookup(root, index, NULL, &slot)) + return NULL; + return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); @@ -481,7 +559,7 @@ EXPORT_SYMBOL(radix_tree_lookup_slot); */ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) { - return radix_tree_lookup_element(root, index, 0); + return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); @@ -896,103 +974,73 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - /** - * radix_tree_next_hole - find the next hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest - * indexed hole. + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= max_scan' - * will be true). In rare cases of index wrap-around, 0 will be returned. + * The implementation is naive. * - * radix_tree_next_hole may be called under rcu_read_lock. However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 5, then subsequently a hole is created at index 10, - * radix_tree_next_hole covering both indexes may return 10 if called - * under rcu_read_lock. + * Like radix_tree_lookup, radix_tree_gang_lookup may be called under + * rcu_read_lock. In this case, rather than the returned results being + * an atomic snapshot of the tree at a single point in time, the semantics + * of an RCU protected gang lookup are as though multiple radix_tree_lookups + * have been issued in individual locks, and results stored in 'results'. 
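A sketch of the RCU lookup semantics just described, with a hypothetical caller batching entries via radix_tree_gang_lookup(); the wrapper and batch size are assumptions for illustration only.

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

#define MY_BATCH        16      /* hypothetical batch size */

static unsigned int my_collect(struct radix_tree_root *tree, void **batch,
                               unsigned long start)
{
        unsigned int nr;

        rcu_read_lock();
        /*
         * Not an atomic snapshot: concurrent inserts/deletes may or may
         * not be seen. A real caller would also take a reference on each
         * entry before dropping the RCU read lock.
         */
        nr = radix_tree_gang_lookup(tree, batch, start, MY_BATCH);
        rcu_read_unlock();

        return nr;
}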
*/ -unsigned long radix_tree_next_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index++; - if (index == 0) - break; - } - - return index; -} -EXPORT_SYMBOL(radix_tree_next_hole); + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; -/** - * radix_tree_prev_hole - find the prev hole (not-present entry) - * @root: tree root - * @index: index key - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] - * for the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= max_scan' - * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. - * - * radix_tree_next_hole may be called under rcu_read_lock. However, like - * radix_tree_gang_lookup, this will not atomically search a snapshot of - * the tree at a single point in time. For example, if a hole is created - * at index 10, then subsequently a hole is created at index 5, - * radix_tree_prev_hole covering both indexes may return 5 if called under - * rcu_read_lock. - */ -unsigned long radix_tree_prev_hole(struct radix_tree_root *root, - unsigned long index, unsigned long max_scan) -{ - unsigned long i; + if (unlikely(!max_items)) + return 0; - for (i = 0; i < max_scan; i++) { - if (!radix_tree_lookup(root, index)) - break; - index--; - if (index == ULONG_MAX) + radix_tree_for_each_slot(slot, root, &iter, first_index) { + results[ret] = rcu_dereference_raw(*slot); + if (!results[ret]) + continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (++ret == max_items) break; } - return index; + return ret; } -EXPORT_SYMBOL(radix_tree_prev_hole); +EXPORT_SYMBOL(radix_tree_gang_lookup); /** - * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * radix_tree_gang_lookup_index - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed + * @indices: where their indices should be placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at - * *@results. + * *@results. The indices are placed in @indices. * * The implementation is naive. * - * Like radix_tree_lookup, radix_tree_gang_lookup may be called under - * rcu_read_lock. In this case, rather than the returned results being - * an atomic snapshot of the tree at a single point in time, the semantics - * of an RCU protected gang lookup are as though multiple radix_tree_lookups - * have been issued in individual locks, and results stored in 'results'. + * Just one difference from radix_tree_gang_lookup, the indices are also + * collected along with the results of lookup. 
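The gang lookup above is built on the slot iterator; the open-coded equivalent looks roughly like the hypothetical walker below, which also shows the radix_tree_iter_retry() pattern introduced earlier in this patch.

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

static unsigned long my_count_present(struct radix_tree_root *tree)
{
        struct radix_tree_iter iter;
        void **slot;
        unsigned long nr = 0;

        rcu_read_lock();
        radix_tree_for_each_slot(slot, tree, &iter, 0) {
                void *entry = radix_tree_deref_slot(slot);

                if (!entry)
                        continue;
                if (radix_tree_deref_retry(entry)) {
                        /* Raced with node replacement: redo this chunk. */
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                nr++;
        }
        rcu_read_unlock();

        return nr;
}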
*/ unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items) +radix_tree_gang_lookup_index(struct radix_tree_root *root, void **results, + unsigned long *indices, unsigned long first_index, + unsigned int max_items) { struct radix_tree_iter iter; void **slot; @@ -1005,13 +1053,15 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); if (!results[ret]) continue; + if (indices) + indices[ret] = iter.index; if (++ret == max_items) break; } return ret; } -EXPORT_SYMBOL(radix_tree_gang_lookup); +EXPORT_SYMBOL(radix_tree_gang_lookup_index); /** * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree @@ -1081,9 +1131,13 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { - results[ret] = indirect_to_ptr(rcu_dereference_raw(*slot)); + results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; + if (radix_tree_is_indirect_ptr(results[ret])) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (++ret == max_items) break; } @@ -1203,8 +1257,10 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); - if (cur_index > max_index) + if (cur_index > max_index) { + rcu_read_unlock(); break; + } cur_index = __locate(node, item, cur_index, &found_index); rcu_read_unlock(); @@ -1285,48 +1341,89 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) } /** - * radix_tree_delete - delete an item from a radix tree + * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @index: index key + * @node: node containing @index * - * Remove the item at @index from the radix tree rooted at @root. + * After clearing the slot at @index in @node from radix tree + * rooted at @root, call this function to attempt freeing the + * node and shrinking the tree. * - * Returns the address of the deleted item, or NULL if it was not present. + * Returns %true if @node was freed, %false otherwise. */ -void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +bool __radix_tree_delete_node(struct radix_tree_root *root, unsigned long index, + struct radix_tree_node *node) { - struct radix_tree_node *node = NULL; - struct radix_tree_node *slot = NULL; - struct radix_tree_node *to_free; - unsigned int height, shift; + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == indirect_to_ptr(root->rnode)) { + radix_tree_shrink(root); + if (root->height == 0) + deleted = true; + } + return deleted; + } + + parent = node->parent; + if (parent) { + index >>= RADIX_TREE_MAP_SHIFT; + + parent->slots[index & RADIX_TREE_MAP_MASK] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->height = 0; + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + +/** + * radix_tree_delete_item - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * @item: expected item + * + * Remove @item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present + * or the entry at the given @index was not @item. 
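A hedged sketch of how the new radix_tree_delete_item() semantics can be used for a conditional removal; the lock and wrapper are hypothetical, and callers normally serialize tree updates themselves.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/types.h>

static bool my_remove_exact(struct radix_tree_root *tree, spinlock_t *lock,
                            unsigned long index, void *item)
{
        void *old;

        spin_lock(lock);
        old = radix_tree_delete_item(tree, index, item);
        spin_unlock(lock);

        /* NULL: the slot was empty or held a different entry. */
        return old == item;
}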
+ */ +void *radix_tree_delete_item(struct radix_tree_root *root, + unsigned long index, void *item) +{ + struct radix_tree_node *node; + unsigned int offset; + void **slot; + void *entry; int tag; - int uninitialized_var(offset); - height = root->height; - if (index > radix_tree_maxindex(height)) - goto out; + entry = __radix_tree_lookup(root, index, &node, &slot); + if (!entry) + return NULL; - slot = root->rnode; - if (height == 0) { + if (item && entry != item) + return NULL; + + if (!node) { root_tag_clear_all(root); root->rnode = NULL; - goto out; + return entry; } - slot = indirect_to_ptr(slot); - shift = height * RADIX_TREE_MAP_SHIFT; - do { - if (slot == NULL) - goto out; - - shift -= RADIX_TREE_MAP_SHIFT; - offset = (index >> shift) & RADIX_TREE_MAP_MASK; - node = slot; - slot = slot->slots[offset]; - } while (shift); - - if (slot == NULL) - goto out; + offset = index & RADIX_TREE_MAP_MASK; /* * Clear all tags associated with the item to be deleted. @@ -1337,40 +1434,27 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_tag_clear(root, index, tag); } - to_free = NULL; - /* Now free the nodes we do not need anymore */ - while (node) { - node->slots[offset] = NULL; - node->count--; - /* - * Queue the node for deferred freeing after the - * last reference to it disappears (set NULL, above). - */ - if (to_free) - radix_tree_node_free(to_free); - - if (node->count) { - if (node == indirect_to_ptr(root->rnode)) - radix_tree_shrink(root); - goto out; - } - - /* Node with zero slots in use so free it */ - to_free = node; + node->slots[offset] = NULL; + node->count--; - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - node = node->parent; - } + __radix_tree_delete_node(root, index, node); - root_tag_clear_all(root); - root->height = 0; - root->rnode = NULL; - if (to_free) - radix_tree_node_free(to_free); + return entry; +} +EXPORT_SYMBOL(radix_tree_delete_item); -out: - return slot; +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + * + * Returns the address of the deleted item, or NULL if it was not present. + */ +void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); diff --git a/mm/filemap.c b/mm/filemap.c index 48a807ac0..e5e1543f9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -666,6 +666,86 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } } +/** + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. 
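Before the implementation below, a small sketch of the return-value convention documented above; the wrapper name is hypothetical.

#include <linux/pagemap.h>
#include <linux/types.h>

static bool my_window_has_hole(struct address_space *mapping, pgoff_t start,
                               unsigned long max_scan)
{
        pgoff_t hole = page_cache_next_hole(mapping, start, max_scan);

        /* Per the comment above, 'hole - start >= max_scan' means no hole. */
        return hole - start < max_scan;
}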
+ */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + if (!radix_tree_lookup(&mapping->page_tree, index)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + + /** * find_get_page - find and get a page reference * @mapping: the address_space to search diff --git a/mm/readahead.c b/mm/readahead.c index 0eac4cbf6..abec1a59b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -367,7 +367,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -446,7 +446,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset+1,max); rcu_read_unlock(); if (!start || start - offset > max) From 5ba273bbcc0ef0e4455592d4d7824f4b73d8d242 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Mon, 20 Jun 2016 22:09:01 -0400 Subject: [PATCH 45/52] cleancache: adopt to old fs --- include/linux/exportfs.h | 16 +++++++++ include/linux/list.h | 74 ++++++++++++++++++++++++++-------------- mm/cleancache.c | 6 ++-- 3 files changed, 68 insertions(+), 28 deletions(-) diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 3a4cef532..10ca9e2a7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -83,6 +83,22 @@ enum fid_type { * 64 bit parent inode number. */ FILEID_NILFS_WITH_PARENT = 0x62, + + /* + * 32 bit generation number, 40 bit i_pos. + */ + FILEID_FAT_WITHOUT_PARENT = 0x71, + + /* + * 32 bit generation number, 40 bit i_pos, + * 32 bit parent generation number, 40 bit parent i_pos + */ + FILEID_FAT_WITH_PARENT = 0x72, + + /* + * Filesystems must not use 0xff file ID. 
+ */ + FILEID_INVALID = 0xff, }; struct fid { diff --git a/include/linux/list.h b/include/linux/list.h index 1712c7e1f..8a8acdcf3 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -361,6 +361,28 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +/** + * list_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_last_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + /** * list_next_entry - get the next element in list * @pos: the type * to cursor @@ -432,9 +454,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. @@ -443,9 +465,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) + for (pos = list_last_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() @@ -468,9 +490,9 @@ static inline void list_splice_tail_init(struct list_head *list, * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (pos = list_next_entry(pos, member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point @@ -482,9 +504,9 @@ static inline void list_splice_tail_init(struct list_head *list, * the current position. 
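For illustration, a typical use of the new list_first_entry_or_null() helper: popping the head of a possibly empty queue. The element type, queue and lock are hypothetical, not part of this patch.

#include <linux/list.h>
#include <linux/spinlock.h>

struct my_work {                        /* hypothetical element type */
        struct list_head link;
        int payload;
};

static struct my_work *my_pop(struct list_head *queue, spinlock_t *lock)
{
        struct my_work *w;

        spin_lock(lock);
        w = list_first_entry_or_null(queue, struct my_work, link);
        if (w)
                list_del(&w->link);
        spin_unlock(lock);

        return w;       /* NULL when the queue was empty */
}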
*/ #define list_for_each_entry_continue_reverse(pos, head, member) \ - for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ - &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member)) + for (pos = list_prev_entry(pos, member); \ + &pos->member != (head); \ + pos = list_prev_entry(pos, member)) /** * list_for_each_entry_from - iterate over list of given type from the current point @@ -495,8 +517,8 @@ static inline void list_splice_tail_init(struct list_head *list, * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ - for (; &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (; &pos->member != (head); \ + pos = list_next_entry(pos, member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry @@ -506,10 +528,10 @@ static inline void list_splice_tail_init(struct list_head *list, * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_safe(pos, n, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ + for (pos = list_first_entry(head, typeof(*pos), member), \ + n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal @@ -522,10 +544,10 @@ static inline void list_splice_tail_init(struct list_head *list, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ - for (pos = list_entry(pos->member.next, typeof(*pos), member), \ - n = list_entry(pos->member.next, typeof(*pos), member); \ + for (pos = list_next_entry(pos, member), \ + n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal @@ -538,9 +560,9 @@ static inline void list_splice_tail_init(struct list_head *list, * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ - for (n = list_entry(pos->member.next, typeof(*pos), member); \ + for (n = list_next_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.next, typeof(*n), member)) + pos = n, n = list_next_entry(n, member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal @@ -553,10 +575,10 @@ static inline void list_splice_tail_init(struct list_head *list, * of list entry. */ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member), \ - n = list_entry(pos->member.prev, typeof(*pos), member); \ + for (pos = list_last_entry(head, typeof(*pos), member), \ + n = list_prev_entry(pos, member); \ &pos->member != (head); \ - pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + pos = n, n = list_prev_entry(n, member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop @@ -571,7 +593,7 @@ static inline void list_splice_tail_init(struct list_head *list, * completing the current iteration of the loop body. 
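And the classic pattern behind the _safe iterators rewritten above: the cursor caches the next element, so entries can be unlinked and freed inside the loop body. The node type here is again hypothetical.

#include <linux/list.h>
#include <linux/slab.h>

struct my_node {                        /* hypothetical element type */
        struct list_head link;
};

static void my_drain(struct list_head *queue)
{
        struct my_node *n, *tmp;

        /* 'tmp' already points at the next element, so freeing 'n' is safe. */
        list_for_each_entry_safe(n, tmp, queue, link) {
                list_del(&n->link);
                kfree(n);
        }
}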
*/ #define list_safe_reset_next(pos, n, member) \ - n = list_entry(pos->member.next, typeof(*pos), member) + n = list_next_entry(pos, member) /* * Double linked lists with a single pointer list head. diff --git a/mm/cleancache.c b/mm/cleancache.c index 770794807..1d97b0d05 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -145,7 +145,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs); static int cleancache_get_key(struct inode *inode, struct cleancache_filekey *key) { - int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); + int (*fhfn)(struct dentry *, __u32 *fh, int *, int); int len = 0, maxlen = CLEANCACHE_KEY_MAX; struct super_block *sb = inode->i_sb; @@ -153,7 +153,9 @@ static int cleancache_get_key(struct inode *inode, if (sb->s_export_op != NULL) { fhfn = sb->s_export_op->encode_fh; if (fhfn) { - len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); + struct dentry d; + d.d_inode = inode; + len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); if (len <= FILEID_ROOT || len == FILEID_INVALID) return -1; if (maxlen > CLEANCACHE_KEY_MAX) From f4cde30c7a63c279a793100ffc7fe502ebe37363 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Tue, 21 Jun 2016 00:33:41 -0400 Subject: [PATCH 46/52] BLOCK: Update BFQ I/O Sched gov from v7r7 to v7r8 (cherry picked from commit b349cfcee99fae37ae930b574bf025438dba968e) --- block/bfq-iosched.c | 4207 +++++++++++++++++++++++++++++++++++++++++++ block/bfq-sched.c | 1180 ++++++++++++ block/bfq.h | 806 +++++++++ 3 files changed, 6193 insertions(+) create mode 100644 block/bfq-iosched.c create mode 100644 block/bfq-sched.c create mode 100644 block/bfq.h diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000..637c90665 --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,4207 @@ +/* + * Budget Fair Queueing (BFQ) disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based on + * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, + * measured in number of sectors, to processes instead of time slices. The + * device is not granted to the in-service process for a given time slice, + * but until it has exhausted its assigned budget. This change from the time + * to the service domain allows BFQ to distribute the device throughput + * among processes as desired, without any distortion due to ZBR, workload + * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, + * called B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated to processes. Thanks to the + * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to + * I/O-bound processes issuing sequential requests (to boost the + * throughput), and yet guarantee a low latency to interactive and soft + * real-time applications. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. 
+ * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness + * with the BFQ Disk I/O Scheduler'', + * Proceedings of the 5th Annual International Systems and Storage + * Conference (SYSTOR '12), June 2012. + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bfq.h" +#include "blk.h" + +/* Expiration time of sync (0) and async (1) requests, in jiffies. */ +static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; + +/* Maximum backwards seek, in KiB. */ +static const int bfq_back_max = 16 * 1024; + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in jiffies. */ +static int bfq_slice_idle = HZ / 125; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = 16 * 1024; +static const int bfq_max_budget_async_rq = 4; + +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number + * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout_sync = HZ / 8; +static int bfq_timeout_async = HZ / 25; + +struct kmem_cache *bfq_pool; + +/* Below this threshold (in ms), we consider thinktime immediate. */ +#define BFQ_MIN_TT 2 + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) + +/* Min samples used for peak rate estimation (for autotuning). */ +#define BFQ_PEAK_RATE_SAMPLES 32 + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. 
+ * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] + * are the reference values for a slow/fast rotational device, whereas + * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for + * a slow/fast non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1536, 10752}; +static int R_fast[2] = {17415, 34791}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup.c" + +#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ + IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * The following macro groups conditions that need to be evaluated when + * checking if existing queues and groups form a symmetric scenario + * and therefore idling can be reduced or disabled for some of the + * queues. See the comment to the function bfq_bfqq_must_not_expire() + * for further details. + */ +#ifdef CONFIG_CGROUP_BFQIO +#define symmetric_scenario (!bfqd->active_numerous_groups && \ + !bfq_differentiated_weights(bfqd)) +#else +#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) +#endif + +/* + * We regard a request as SYNC, if either it's a read or has the SYNC bit + * set (in which case it could also be a direct WRITE). + */ +static inline int bfq_bio_sync(struct bio *bio) +{ + if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) + return 1; + + return 0; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. 
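To make the weight-raising duration formula above concrete: rates are stored in sectors/usec left-shifted by BFQ_RATE_SHIFT, so R_fast[1] = 34791 corresponds to roughly 0.53 sectors/usec, on the order of 270 MB/s for a fast non-rotational device (an approximate figure, not from the patch). The sketch below is a simplified userspace-style version of the computation; the in-kernel bfq_wr_duration() later in this file uses RT_prod and do_div() instead.

/* Illustrative only: duration = (R / r) * T, computed as (R * T) / r. */
static unsigned long my_wr_duration(unsigned long long rt_prod, /* R * T */
                                    unsigned long peak_rate)    /* r */
{
        /* A device at half the reference rate gets twice the duration. */
        return (unsigned long)(rt_prod / peak_rate);
}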
+ */ +static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned wrap = 0; /* bit mask: requests behind the disk head? */ + + if (rq1 == NULL || rq1 == rq2) + return rq2; + if (rq2 == NULL) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + else { + if (s1 >= s2) + return rq1; + else + return rq2; + } + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (long long unsigned)sector, + bfqq != NULL ? 
bfqq->pid : 0); + + return bfqq; +} + +static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfqd->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (__bfqq == NULL) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_CGROUP_BFQIO + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. + * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Do not insert if the entity is already associated with a + * counter, which happens if: + * 1) the entity is associated with a queue, + * 2) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 3) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (entity->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + entity->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + entity->weight_counter->weight = entity->weight; + rb_link_node(&entity->weight_counter->weights_node, parent, new); + rb_insert_color(&entity->weight_counter->weights_node, root); + +inc_counter: + entity->weight_counter->num_active++; +} + +/* + * Decrement the weight counter associated with the entity, and, if the + * counter reaches 0, remove the counter from the tree. 
+ * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + if (!entity->weight_counter) + return; + + BUG_ON(RB_EMPTY_ROOT(root)); + BUG_ON(entity->weight_counter->weight != entity->weight); + + BUG_ON(!entity->weight_counter->num_active); + entity->weight_counter->num_active--; + if (entity->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase(&entity->weight_counter->weights_node, root); + kfree(entity->weight_counter); + +reset_entity_pointer: + entity->weight_counter = NULL; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev != NULL) + prev = rb_entry_rq(rbprev); + + if (rbnext != NULL) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * + (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * + bfq_async_charge_factor)); +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (next_rq == NULL) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_activate_bfqq(bfqd, bfqq); + } +} + +static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + return dur; +} + +static inline unsigned +bfq_bfqq_cooperations(struct bfq_queue *bfqq) +{ + return bfqq->bic ? 
bfqq->bic->cooperations : 0; +} + +static inline void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + /* Assuming that the flag in_large_burst is already correctly set */ + if (bic->wr_time_left && bfqq->bfqd->low_latency && + !bfq_bfqq_in_large_burst(bfqq) && + bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues++; + bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bic->wr_time_left; + bfqq->last_wr_start_finish = jiffies; + bfqq->entity.ioprio_changed = 1; + } + /* + * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ + bic->wr_time_left = 0; +} + +/* Must be called with the queue_lock held. */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *pos, *n; + + hlist_for_each_entry_safe(item, pos, n, + &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *p, *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, n, &bfqd->burst_list, + burst_list_node) + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_mark_bfqq_in_large_burst(bfqq); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, p, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* burst not yet large: add bfqq to the burst list */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues happen to become active shortly after each other, then, + * to help the processes associated to these queues get their job done as + * soon as possible, it is usually better to not grant either weight-raising + * or device idling to these queues. 
In this comment we describe, firstly, + * the reasons why this fact holds, and, secondly, the next function, which + * implements the main steps needed to properly mark these queues so that + * they can then be treated in a different way. + * + * As for the terminology, we say that a queue becomes active, i.e., + * switches from idle to backlogged, either when it is created (as a + * consequence of the arrival of an I/O request), or, if already existing, + * when a new request for the queue arrives while the queue is idle. + * Bursts of activations, i.e., activations of different queues occurring + * shortly after each other, are typically caused by services or applications + * that spawn or reactivate many parallel threads/processes. Examples are + * systemd during boot or git grep. + * + * These services or applications benefit mostly from a high throughput: + * the quicker the requests of the activated queues are cumulatively served, + * the sooner the target job of these queues gets completed. As a consequence, + * weight-raising any of these queues, which also implies idling the device + * for it, is almost always counterproductive: in most cases it just lowers + * throughput. + * + * On the other hand, a burst of activations may be also caused by the start + * of an application that does not consist in a lot of parallel I/O-bound + * threads. In fact, with a complex application, the burst may be just a + * consequence of the fact that several processes need to be executed to + * start-up the application. To start an application as quickly as possible, + * the best thing to do is to privilege the I/O related to the application + * with respect to all other I/O. Therefore, the best strategy to start as + * quickly as possible an application that causes a burst of activations is + * to weight-raise all the queues activated during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the two + * types of bursts need to be distinguished. Fortunately, this seems + * relatively easy to do, by looking at the sizes of the bursts. In + * particular, we found a threshold such that bursts with a larger size + * than that threshold are apparently caused only by services or commands + * such as systemd or git grep. For brevity, hereafter we call just 'large' + * these bursts. BFQ *does not* weight-raise queues whose activations occur + * in a large burst. In addition, for each of these queues BFQ performs or + * does not perform idling depending on which choice boosts the throughput + * most. The exact choice depends on the device and request pattern at + * hand. + * + * Turning back to the next function, it implements all the steps needed + * to detect the occurrence of a large burst and to properly mark all the + * queues belonging to it (so that they can then be treated in a different + * way). This goal is achieved by maintaining a special "burst list" that + * holds, temporarily, the queues that belong to the burst in progress. The + * list is then used to mark these queues as belonging to a large burst if + * the burst does become large. The main steps are the following. + * + * . when the very first queue is activated, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . 
if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is activated while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . if a queue Q that does not belong to the burst is activated a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool idle_for_long_time) +{ + /* + * If bfqq happened to be activated in a burst, but has been idle + * for at least as long as an interactive queue, then we assume + * that, in the overall I/O initiated in the burst, the I/O + * associated to bfqq is finished. So bfqq does not need to be + * treated as a queue belonging to a burst anymore. Accordingly, + * we reset bfqq's in_large_burst flag if set, and remove bfqq + * from the burst list if it's there. We do not decrement instead + * burst_size, because the fact that bfqq does not need to belong + * to the burst list any more does not invalidate the fact that + * bfqq may have been activated during the current burst. + */ + if (idle_for_long_time) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + /* + * If bfqq is already in the burst list or is part of a large + * burst, then there is nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq)) + return; + + /* + * If bfqq's activation happens late enough, then the current + * burst is finished, and related data structures must be reset. + * + * In this respect, consider the special case where bfqq is the very + * first queue being activated. In this case, last_ins_in_burst is + * not yet significant when we get here. But it is easy to verify + * that, whether or not the following condition is true, bfqq will + * end up being inserted into the burst list. In particular the + * list will happen to contain only bfqq. And this is exactly what + * has to happen, as bfqq may be the first queue in a possible + * burst. 
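The burst-tracking steps listed above reduce to a small state machine over the last activation time, the current burst size and a large-burst flag. The following stand-alone sketch (user-space C; the names, the plain array standing in for the hlist-based burst list, and the threshold values are all illustrative, not the kernel's) shows that state machine in isolation:

/* Minimal user-space sketch of the burst-detection logic described above. */
#include <stdbool.h>
#include <stdio.h>

#define BURST_INTERVAL   2   /* "shortly after" threshold, in ticks (illustrative) */
#define LARGE_BURST_THR  4   /* burst size beyond which it is "large" (illustrative) */
#define MAX_QUEUES       32

struct queue {
        int  id;
        bool in_large_burst;
};

static struct queue *burst_list[MAX_QUEUES];
static int burst_size;
static bool large_burst;
static unsigned long last_ins_in_burst;

static void handle_activation(struct queue *q, unsigned long now)
{
        int i;

        if (now > last_ins_in_burst + BURST_INTERVAL) {
                /* Activation came late: current burst is over, restart. */
                large_burst = false;
                burst_list[0] = q;
                burst_size = 1;
        } else if (large_burst) {
                /* Burst already known to be large: mark q immediately. */
                q->in_large_burst = true;
        } else {
                burst_list[burst_size++] = q;
                if (burst_size == LARGE_BURST_THR) {
                        /* Burst just became large: mark every queue seen so
                         * far, then drop the (no longer needed) list. */
                        large_burst = true;
                        for (i = 0; i < burst_size; i++)
                                burst_list[i]->in_large_burst = true;
                        burst_size = 0;
                }
        }
        last_ins_in_burst = now;
}

int main(void)
{
        struct queue q[6] = { {0}, {1}, {2}, {3}, {4}, {5} };
        unsigned long t[6] = { 0, 1, 2, 3, 4, 20 };     /* activation times */
        int i;

        for (i = 0; i < 6; i++)
                handle_activation(&q[i], t[i]);
        for (i = 0; i < 6; i++)
                printf("queue %d in_large_burst=%d\n", q[i].id,
                       q[i].in_large_burst);
        return 0;       /* queues 0-4 end up in a large burst, queue 5 does not */
}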
+ */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval)) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + return; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_mark_bfqq_in_large_burst(bfqq); + return; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned long old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { + bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); + + if (bfq_bfqq_sync(bfqq)) { + bool already_in_burst = + !hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq); + bfq_handle_burst(bfqd, bfqq, idle_for_long_time); + /* + * If bfqq was not already in the current burst, + * then, at this point, bfqq either has been + * added to the current burst or has caused the + * current burst to terminate. In particular, in + * the second case, bfqq has become the first + * queue in a possible new burst. + * In both cases last_ins_in_burst needs to be + * moved forward. + */ + if (!already_in_burst) + bfqd->last_ins_in_burst = jiffies; + } + + coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (time_before(jiffies, + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle)) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (!bfqd->low_latency) + goto add_bfqq_busy; + + if (bfq_bfqq_just_split(bfqq)) + goto set_ioprio_changed; + + /* + * If the queue: + * - is not being boosted, + * - has been idle for enough time, + * - is not a sync queue or is linked to a bfq_io_cq (it is + * shared "for its nature" or it is not shared and its + * requests have not been redirected to a shared queue) + * start a weight-raising period. 
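Before weight raising is started, the code above classifies the newly busy queue as interactive and/or soft real-time. A compact stand-alone sketch of that classification follows; the structure, the helper name and the numbers are illustrative, and the bfq_wr_max_softrt_rate > 0 precondition for soft rt is omitted for brevity:

/* Sketch of the interactive / soft-rt classification performed above. */
#include <stdbool.h>
#include <stdio.h>

struct qstate {
        unsigned long budget_timeout;     /* when the queue last emptied */
        unsigned long soft_rt_next_start; /* earliest soft-rt eligibility */
        bool coop_or_in_burst;            /* cooperating or in a large burst */
};

/* time_is_before_jiffies(t) is roughly "t < now", wrap-safe */
static bool before(unsigned long t, unsigned long now)
{
        return (long)(t - now) < 0;
}

static void classify(const struct qstate *q, unsigned long now,
                     unsigned long wr_min_idle_time,
                     bool *interactive, bool *soft_rt)
{
        bool idle_for_long_time =
                before(q->budget_timeout + wr_min_idle_time, now);

        *soft_rt = !q->coop_or_in_burst &&
                   before(q->soft_rt_next_start, now);
        *interactive = !q->coop_or_in_burst && idle_for_long_time;
}

int main(void)
{
        struct qstate q = { .budget_timeout = 100,
                            .soft_rt_next_start = 500,
                            .coop_or_in_burst = false };
        bool interactive, soft_rt;

        classify(&q, 400, 200, &interactive, &soft_rt);
        printf("interactive=%d soft_rt=%d\n", interactive, soft_rt);
        /* budget_timeout + 200 = 300 < 400 -> interactive;
         * soft_rt_next_start = 500 is still in the future -> not soft rt */
        return 0;
}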
+ */ + if (old_wr_coeff == 1 && (interactive || soft_rt) && + (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (time_before( + bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time, + jiffies + + bfqd->bfq_wr_rt_max_time) && + soft_rt) { + /* + * + * The remaining weight-raising time is lower + * than bfqd->bfq_wr_rt_max_time, which means + * that the application is enjoying weight + * raising either because deemed soft-rt in + * the near past, or because deemed interactive + * a long ago. + * In both cases, resetting now the current + * remaining weight-raising time for the + * application to the weight-raising duration + * for soft rt applications would not cause any + * latency increase for the application (as the + * new duration would be higher than the + * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. + * In the end we can correctly and safely + * (re)charge the weight-raising duration for + * the application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. 
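The recharge test just described compares the remaining weight-raising time with the soft-rt duration. A small worked-arithmetic sketch, with made-up jiffies values, may make the comparison easier to follow:

/* Sketch of the "recharge" test above. Values are illustrative. */
#include <stdbool.h>
#include <stdio.h>

static bool time_before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

int main(void)
{
        unsigned long jiffies = 1000;
        unsigned long last_wr_start_finish = 900;
        unsigned long wr_cur_max_time = 300;   /* period would end at 1200 */
        unsigned long wr_rt_max_time = 400;    /* soft-rt duration */

        if (time_before(last_wr_start_finish + wr_cur_max_time,
                        jiffies + wr_rt_max_time)) {
                /* remaining 200 < 400: recharging cannot shorten the period */
                last_wr_start_finish = jiffies;
                wr_cur_max_time = wr_rt_max_time;
        }
        printf("period now ends at %lu\n",
               last_wr_start_finish + wr_cur_max_time);        /* 1400 */
        return 0;
}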
+ */ + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + } +set_ioprio_changed: + if (old_wr_coeff != bfqq->wr_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); + } else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return NULL; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + if (bfqq != NULL) { + sector_t sector = bio->bi_sector + bio_sectors(bio); + + return elv_rb_find(&bfqq->sort_list, sector); + } + + return NULL; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", + (long long unsigned)bfqd->last_position); +} + +static inline void bfq_deactivate_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, 1); + /* + * Remove queue from request-position tree as it is empty. 
+ */ + if (bfqq->pos_root != NULL) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} + +static int bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(next_rq == NULL); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_rq_pos_tree_add(bfqd, bfqq); + } + } +} + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. 
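The FIFO rule applied right below is simply "the merged request inherits the older deadline". A minimal user-space sketch, with illustrative names and a plain field instead of the request's fifo bookkeeping:

/* Sketch of the FIFO repositioning rule used when 'next' is merged into 'rq'. */
#include <stdio.h>
#include <stdbool.h>

struct req {
        const char   *name;
        unsigned long fifo_time;   /* expiration deadline in the FIFO */
};

static bool time_before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

static void merged_requests(struct req *rq, struct req *next)
{
        if (time_before(next->fifo_time, rq->fifo_time))
                rq->fifo_time = next->fifo_time;   /* take the older deadline */
        /* 'next' is then removed from the scheduler's lists */
}

int main(void)
{
        struct req rq   = { .name = "rq",   .fifo_time = 250 };
        struct req next = { .name = "next", .fifo_time = 180 };

        merged_requests(&rq, &next);
        printf("%s now expires at %lu\n", rq.name, rq.fifo_time); /* 180 */
        return 0;
}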
+ */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + time_before(rq_fifo_time(next), rq_fifo_time(rq))) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq_set_fifo_time(rq, rq_fifo_time(next)); + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); +} + +/* Must be called with bfqq != NULL */ +static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(bfqq == NULL); + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + /* Trigger a weight change on the next activation of the queue */ + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j] != NULL) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq != NULL) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_sector; +} + +static inline sector_t bfq_dist_from(sector_t pos1, + sector_t pos2) +{ + if (pos1 >= pos2) + return pos1 - pos2; + else + return pos2 - pos1; +} + +static inline int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= + BFQQ_SEEK_THR; +} + +static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) +{ + struct rb_root *root = &bfqd->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq != NULL) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). 
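The "closeness" test used when searching the rq_pos_tree boils down to an absolute sector distance compared against a seek threshold. A self-contained sketch (the threshold value here is made up for the example, not BFQ's actual BFQQ_SEEK_THR):

/* Sketch of the sector-closeness test used to find a cooperating queue. */
#include <stdio.h>
#include <stdbool.h>

typedef unsigned long long sector_t;

#define SEEK_THR 1024ULL        /* illustrative threshold, in sectors */

static sector_t dist_from(sector_t pos1, sector_t pos2)
{
        return pos1 >= pos2 ? pos1 - pos2 : pos2 - pos1;
}

static bool close_to_sector(sector_t pos, sector_t sector)
{
        return dist_from(pos, sector) <= SEEK_THR;
}

int main(void)
{
        sector_t last_position = 100000;

        printf("%d\n", close_to_sector(100500, last_position)); /* 1: close */
        printf("%d\n", close_to_sector(200000, last_position)); /* 0: far   */
        return 0;
}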
+ */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (node == NULL) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +/* + * bfqd - obvious + * cur_bfqq - passed in so that we don't decide that the current queue + * is closely cooperating with itself + * sector - used as a reference point to search for a close queue + */ +static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + if (bfq_class_idle(cur_bfqq)) + return NULL; + if (!bfq_bfqq_sync(cur_bfqq)) + return NULL; + if (BFQQ_SEEKY(cur_bfqq)) + return NULL; + + /* If device has only one backlogged bfq_queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + /* + * We should notice if some of the queues are cooperating, e.g. + * working closely on the same area of the disk. In that case, + * we can group them together and don't waste time idling. + */ + bfqq = bfqq_close(bfqd, sector); + if (bfqq == NULL || bfqq == cur_bfqq) + return NULL; + + /* + * Do not merge queues from different bfq_groups. + */ + if (bfqq->entity.parent != cur_bfqq->entity.parent) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!bfq_bfqq_sync(bfqq)) + return NULL; + if (BFQQ_SEEKY(bfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes. + */ + if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. 
+ * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); + return new_bfqq; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service queue + * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq == NULL || in_service_bfqq == bfqq || + !bfqd->in_service_bic || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + + if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + + if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq != NULL) + return new_bfqq; /* Merge with in-service queue */ + } + + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static inline void +bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic == NULL, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (bfqq->bic == NULL) + return; + if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would + * deserve weight raising: we set wr_time_left to the full + * weight-raising duration to trigger weight-raising when + * and if the queue is split and the first request of the + * queue is enqueued. + */ + bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); + else if (bfqq->wr_coeff > 1) { + unsigned long wr_duration = + jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts + * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). 
If the weight raising period is + * about to end, don't save it. + */ + if (bfqq->wr_cur_max_time <= wr_duration) + bfqq->bic->wr_time_left = 0; + else + bfqq->bic->wr_time_left = + bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in + * both cases it should not be owned by an interactive or + * soft real-time application. + */ + bfq_bfqq_end_wr(bfqq); + } else + bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq->bic->cooperations++; + bfqq->bic->failed_cooperations = 0; +} + +static inline void +bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + bfq_put_queue(bfqq); +} + +static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + struct bfq_data *bfqd = bfqq->bfqd; + + if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { + bic->failed_cooperations++; + if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) + bic->cooperations = 0; + } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (bfq_bio_sync(bio) && !rq_is_sync(rq)) + return 0; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. 
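The merge-permission rule described above ("allow merge only if rq is queued there", never sync into async) can be sketched in isolation as follows; the types are simplified stand-ins for the kernel structures and the cooperator early-merge step is left out:

/* Simplified stand-alone sketch of the allow-merge decision. */
#include <stdbool.h>
#include <stdio.h>

struct bfq_queue { int id; };

struct request {
        bool sync;
        struct bfq_queue *bfqq;
};

struct bio_ctx {                 /* what bfq can derive from the bio */
        bool sync;
        struct bfq_queue *bfqq;  /* queue the bio would be queued in */
};

static bool allow_merge(const struct request *rq, const struct bio_ctx *bio)
{
        if (bio->sync && !rq->sync)
                return false;           /* never merge sync into async */
        if (!bio->bfqq)
                return false;           /* no queue known for this task */
        return bio->bfqq == rq->bfqq;   /* must target the same queue  */
}

int main(void)
{
        struct bfq_queue qa = { 1 }, qb = { 2 };
        struct request rq = { .sync = true, .bfqq = &qa };
        struct bio_ctx same  = { .sync = true, .bfqq = &qa };
        struct bio_ctx other = { .sync = true, .bfqq = &qb };

        printf("%d %d\n", allow_merge(&rq, &same), allow_merge(&rq, &other));
        return 0;       /* prints "1 0" */
}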
+ */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (bic == NULL) + return 0; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq != NULL) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq != NULL) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + return bfqq == RQ_BFQQ(rq); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); + bfq_mark_bfqq_budget_new(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < 194) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + unsigned long sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (bic == NULL || atomic_read(&bic->icq.ioc->nr_tasks) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue either + * has been seeky for long enough or has already proved to be + * constantly seeky. 
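The idle-slice length chosen right below follows a simple rule: seeky queues in a symmetric, non-weight-raised scenario get only a minimal wait, weight-raised queues get a longer one, everyone else gets the base idle time. A sketch with illustrative constants:

/* Sketch of the idle-slice length selection. Constants are illustrative. */
#include <stdio.h>

#define SLICE_IDLE  8   /* base idle time, ms (illustrative) */
#define MIN_TT      2   /* minimal think-time grant, ms (illustrative) */

static unsigned int idle_slice(int seeky, int weight_raised, int symmetric)
{
        unsigned int sl = SLICE_IDLE;

        if (seeky && !weight_raised && symmetric)
                sl = sl < MIN_TT ? sl : MIN_TT;
        else if (weight_raised)
                sl *= 3;
        return sl;
}

int main(void)
{
        printf("seeky: %u ms\n", idle_slice(1, 0, 1));         /* 2  */
        printf("weight-raised: %u ms\n", idle_slice(0, 1, 1)); /* 24 */
        printf("default: %u ms\n", idle_slice(0, 0, 1));       /* 8  */
        return 0;
}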
+ */ + if (bfq_sample_valid(bfqq->seek_samples) && + ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > + bfq_max_budget(bfqq->bfqd) / 8) || + bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && + symmetric_scenario) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); + else if (bfqq->wr_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); + bfq_log(bfqd, "arm idle: %u/%u ms", + jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff; + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfq_clear_bfqq_budget_new(bfqq); + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * + timeout_coeff)); +} + +/* + * Move request from internal lists to the request queue dispatch list. + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) + bfqd->sync_flight++; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +{ + struct request *rq = NULL; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + if (list_empty(&bfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (time_before(jiffies, rq_fifo_time(rq))) + return NULL; + + return rq; +} + +static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + return entity->budget - entity->service; +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->in_service_queue); + + __bfq_bfqd_reset_in_service(bfqd); + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * Overloading budget_timeout field to store the time + * at which the queue remains with no backlog; used by + * the weight-raising mechanism. 
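The budget-timeout computation in bfq_set_budget_timeout scales the base timeout by weight/orig_weight, except during a soft-rt weight-raising period, so a weight-raised queue gets proportionally more wall-clock time to spend its budget. A small worked sketch, with made-up numbers:

/* Sketch of the budget-timeout computation. Values are illustrative. */
#include <stdio.h>

static unsigned long budget_timeout(unsigned long now,
                                    unsigned long base_timeout,
                                    unsigned int weight,
                                    unsigned int orig_weight,
                                    int in_soft_rt_wr_period)
{
        unsigned int coeff = in_soft_rt_wr_period ? 1 : weight / orig_weight;

        return now + base_timeout * coeff;
}

int main(void)
{
        /* weight raised 10x: the timeout is 10x longer */
        printf("%lu\n", budget_timeout(1000, 125, 1000, 100, 0)); /* 2250 */
        /* soft-rt raising: coefficient forced back to 1 */
        printf("%lu\n", budget_timeout(1000, 125, 1000, 100, 1)); /* 1125 */
        return 0;
}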
+ */ + bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); + } else { + bfq_activate_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_rq_pos_tree_add(bfqd, bfqq); + } +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget. See the body for detailed + * comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + unsigned long budget, min_budget; + + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + + BUG_ON(bfqq != bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because: 1) it + * gives the chance to boost the throughput if + * this is not a seeky process (which may have + * bumped into this timeout because of, e.g., + * ZBR), 2) together with charge_full_budget + * it helps give seeky processes higher + * timestamps, and hence be served less + * frequently. + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * Leave the budget unchanged. 
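Taken together, the feedback rules discussed in this switch can be restated compactly for a sync queue: shrink the budget only on a too-idle expiration with no outstanding requests, double it on a timeout (or on too-idle with requests still in flight), quadruple it on exhaustion, and always cap at the maximum. The stand-alone sketch below mirrors those rules; enum and variable names are illustrative:

/* Recap of the budget-feedback rules for a sync queue. */
#include <stdio.h>

enum expiration { TOO_IDLE, BUDGET_TIMEOUT, BUDGET_EXHAUSTED, NO_MORE_REQUESTS };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long next_budget(unsigned long budget, unsigned long max_budget,
                                 unsigned long min_budget, int dispatched,
                                 enum expiration reason)
{
        switch (reason) {
        case TOO_IDLE:
                if (dispatched > 0)             /* still outstanding requests */
                        return min_ul(budget * 2, max_budget);
                if (budget > 5 * min_budget)    /* shrink, but not below min  */
                        return budget - 4 * min_budget;
                return min_budget;
        case BUDGET_TIMEOUT:
                return min_ul(budget * 2, max_budget);
        case BUDGET_EXHAUSTED:
                return min_ul(budget * 4, max_budget);
        case NO_MORE_REQUESTS:
        default:
                return budget;                  /* leave unchanged */
        }
}

int main(void)
{
        unsigned long max = 16384, min = max / 32;      /* min = 512 */

        printf("%lu\n", next_budget(4096, max, min, 0, TOO_IDLE));         /* 2048  */
        printf("%lu\n", next_budget(4096, max, min, 0, BUDGET_TIMEOUT));   /* 8192  */
        printf("%lu\n", next_budget(4096, max, min, 0, BUDGET_EXHAUSTED)); /* 16384 */
        return 0;
}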
+ */ + default: + return; + } + } else /* async queue */ + /* async queues get always the maximum possible budget + * (their ability to dispatch is limited by + * @bfqd->bfq_max_budget_async_rq). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && + bfqq->max_budget > bfqd->bfq_max_budget) + bfqq->max_budget = bfqd->bfq_max_budget; + + /* + * Make sure that we have enough budget for the next request. + * Since the finish time of the bfqq must be kept in sync with + * the budget, be sure to call __bfq_bfqq_expire() after the + * update. + */ + next_rq = bfqq->next_rq; + if (next_rq != NULL) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + else + bfqq->entity.budget = bfqq->max_budget; + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", + next_rq != NULL ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) +{ + unsigned long max_budget; + + /* + * The max_budget calculated when autotuning is equal to the + * amount of sectors transfered in timeout_sync at the + * estimated peak rate. + */ + max_budget = (unsigned long)(peak_rate * 1000 * + timeout >> BFQ_RATE_SHIFT); + + return max_budget; +} + +/* + * In addition to updating the peak rate, checks whether the process + * is "slow", and returns 1 if so. This slow flag is used, in addition + * to the budget timeout, to reduce the amount of service provided to + * seeky processes, and hence reduce their chances to lower the + * throughput. See the code for more details. + */ +static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int compensate, enum bfqq_expiration reason) +{ + u64 bw, usecs, expected, timeout; + ktime_t delta; + int update = 0; + + if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) + return 0; + + if (compensate) + delta = bfqd->last_idling_start; + else + delta = ktime_get(); + delta = ktime_sub(delta, bfqd->last_budget_start); + usecs = ktime_to_us(delta); + + /* Don't trust short/unrealistic values. */ + if (usecs < 100 || usecs >= LONG_MAX) + return 0; + + /* + * Calculate the bandwidth for the last slice. We use a 64 bit + * value to store the peak rate, in sectors per usec in fixed + * point math. We do so to have enough precision in the estimate + * and to avoid overflows. + */ + bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; + do_div(bw, (unsigned long)usecs); + + timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + /* + * Use only long (> 20ms) intervals to filter out spikes for + * the peak rate estimation. 
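The peak-rate update performed right below is a fixed-point low-pass filter, new_rate = (7/8) * old_rate + (1/8) * measured_bw. A worked user-space sketch of the same arithmetic (the shift value and the sample numbers are illustrative):

/* Worked sketch of the peak-rate low-pass filter. */
#include <stdio.h>
#include <stdint.h>

#define RATE_SHIFT 16   /* fixed-point precision, illustrative */

static uint64_t measured_bw(uint64_t sectors, uint64_t usecs)
{
        return (sectors << RATE_SHIFT) / usecs; /* sectors/usec, fixed point */
}

static uint64_t update_peak_rate(uint64_t peak_rate, uint64_t bw)
{
        return (peak_rate * 7) / 8 + bw / 8;    /* 7/8 old + 1/8 new */
}

int main(void)
{
        uint64_t peak = measured_bw(20000, 100000);     /* current estimate */
        uint64_t bw   = measured_bw(36000, 100000);     /* faster new sample */

        peak = update_peak_rate(peak, bw);
        printf("new peak (fixed point): %llu\n", (unsigned long long)peak);
        return 0;
}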
+ */ + if (usecs > 20000) { + if (bw > bfqd->peak_rate || + (!BFQQ_SEEKY(bfqq) && + reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { + bfq_log(bfqd, "measured bw =%llu", bw); + /* + * To smooth oscillations use a low-pass filter with + * alpha=7/8, i.e., + * new_rate = (7/8) * old_rate + (1/8) * bw + */ + do_div(bw, 8); + if (bw == 0) + return 0; + bfqd->peak_rate *= 7; + do_div(bfqd->peak_rate, 8); + bfqd->peak_rate += bw; + update = 1; + bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); + } + + update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; + + if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && + update) { + int dev_type = blk_queue_nonrot(bfqd->queue); + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd->peak_rate, + timeout); + bfq_log(bfqd, "new max_budget=%lu", + bfqd->bfq_max_budget); + } + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + } + } + + /* + * If the process has been served for a too short time + * interval to let its possible sequential accesses prevail on + * the initial seek time needed to move the disk head on the + * first sector it requested, then give the process a chance + * and for the moment return false. + */ + if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) + return 0; + + /* + * A process is considered ``slow'' (i.e., seeky, so that we + * cannot treat it fairly in the service domain, as it would + * slow down too much the other processes) if, when a slice + * ends for whatever reason, it has received service at a + * rate that would not be high enough to complete the budget + * before the budget timeout expiration. + */ + expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; + + /* + * Caveat: processes doing IO in the slower disk zones will + * tend to be slow(er) even if not seeky. And the estimated + * peak rate will actually be an average over the disk + * surface. Hence, to not be too harsh with unlucky processes, + * we keep a budget/3 margin of safety before declaring a + * process slow. + */ + return expected > (4 * bfqq->entity.budget) / 3; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. 
+ * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy application may happen to behave in an + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). + * + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: + * 1) HZ is so low that the duration of a jiffy is comparable to or higher + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with + * HZ=100. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, we do not use as a reference time interval just + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. + */ +static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return max(bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + bfqq->bfqd->bfq_slice_idle + 4); +} + +/* + * Return the largest-possible time instant such that, for as long as possible, + * the current time will be lower than this time instant according to the macro + * time_is_before_jiffies(). + */ +static inline unsigned long bfq_infinity_from_now(unsigned long now) +{ + return now + ULONG_MAX / 2; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * + * If the process associated to the queue is slow (i.e., seeky), or in + * case of budget timeout, or, finally, if it is async, we + * artificially charge it an entire budget (independently of the + * actual service it received). As a consequence, the queue will get + * higher timestamps than the correct ones upon reactivation, and + * hence it will be rescheduled as if it had received more service + * than what it actually received. 
In the end, this class of processes + * will receive less service in proportion to how slowly they consume + * their budgets (and hence how seriously they tend to lower the + * throughput). + * + * In contrast, when a queue expires because it has been idling for + * too much or because it exhausted its budget, we do not touch the + * amount of service it has received. Hence when the queue will be + * reactivated and its timestamps updated, the latter will be in sync + * with the actual service received by the queue until expiration. + * + * Charging a full budget to the first type of queues and the exact + * service to the others has the effect of using the WF2Q+ policy to + * schedule the former on a timeslice basis, without violating the + * service domain guarantees of the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int compensate, + enum bfqq_expiration reason) +{ + int slow; + BUG_ON(bfqq != bfqd->in_service_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). + */ + slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); + + /* + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * + * Processes doing I/O in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. To avoid punishing + * them we do not charge a full budget to a process that + * succeeded in consuming at least 2/3 of its budget. + */ + if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + + bfqq->service_from_backlogged += bfqq->entity.service; + + if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && + !bfq_bfqq_constantly_seeky(bfqq)) { + bfq_mark_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) + bfqd->const_seeky_busy_in_flight_queues++; + } + + if (reason == BFQ_BFQQ_TOO_IDLE && + bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding requests, + * then the request pattern is isochronous (see the comments + * to the function bfq_bfqq_softrt_next_start()). Hence we + * can compute soft_rt_next_start. If, instead, the queue + * still has outstanding requests, then we have to wait + * for the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ + if (bfqq->dispatched == 0) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + else { + /* + * The application is still waiting for the + * completion of one or more requests: + * prevent it from possibly being incorrectly + * deemed as soft real-time by setting its + * soft_rt_next_start to infinity. In fact, + * without this assignment, the application + * would be incorrectly deemed as soft + * real-time if: + * 1) it issued a new request before the + * completion of all its in-flight + * requests, and + * 2) at that time, its soft_rt_next_start + * happened to be in the past. 
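The soft_rt_next_start value used in the soft real-time logic above is the maximum of two terms: the instant at which serving the accumulated backlog at the maximum soft-rt rate would complete, and "now plus an idle slice plus a few jiffies of slack". A worked sketch of that computation, with illustrative HZ and input values:

/* Worked sketch of the soft_rt_next_start computation. */
#include <stdio.h>

#define HZ 100  /* illustrative */

static unsigned long max_ul(unsigned long a, unsigned long b)
{
        return a > b ? a : b;
}

static unsigned long softrt_next_start(unsigned long last_idle_bklogged,
                                       unsigned long service_from_backlogged,
                                       unsigned long max_softrt_rate,
                                       unsigned long jiffies,
                                       unsigned long slice_idle)
{
        return max_ul(last_idle_bklogged +
                      HZ * service_from_backlogged / max_softrt_rate,
                      jiffies + slice_idle + 4);
}

int main(void)
{
        /* 7000 sectors of backlog at 7000 sectors/s takes 1 s = 100 jiffies */
        printf("%lu\n", softrt_next_start(1000, 7000, 7000, 1050, 1)); /* 1100 */
        /* tiny backlog: the jiffies + slice_idle + 4 slack term dominates */
        printf("%lu\n", softrt_next_start(1000, 70, 7000, 1050, 1));   /* 1055 */
        return 0;
}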
+ */ + bfqq->soft_rt_next_start = + bfq_infinity_from_now(jiffies); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_budget_new(bfqq) || + time_before(jiffies, bfqq->budget_timeout)) + return 0; + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new + * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be + * kicked off for preserving a high throughput. +*/ +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * Device idling is allowed only for the queues for which this function + * returns true. For this reason, the return value of this function plays a + * critical role for both throughput boosting and service guarantees. The + * return value is computed through a logical expression. In this rather + * long comment, we try to briefly describe all the details and motivations + * behind the components of this logical expression. + * + * First, the expression is false if bfqq is not sync, or if: bfqq happened + * to become active during a large burst of queue activations, and the + * pattern of requests bfqq contains boosts the throughput if bfqq is + * expired. In fact, queues that became active during a large burst benefit + * only from throughput, as discussed in the comments to bfq_handle_burst. + * In this respect, expiring bfqq certainly boosts the throughput on NCQ- + * capable flash-based devices, whereas, on rotational devices, it boosts + * the throughput only if bfqq contains random requests. + * + * On the opposite end, if (a) bfqq is sync, (b) the above burst-related + * condition does not hold, and (c) bfqq is being weight-raised, then the + * expression always evaluates to true, as device idling is instrumental + * for preserving low-latency guarantees (see [1]). If, instead, conditions + * (a) and (b) do hold, but (c) does not, then the expression evaluates to + * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and + * (2) at least one of the following two conditions holds. + * The first condition is that the device is not performing NCQ, because + * idling the device most certainly boosts the throughput if this condition + * holds and bfqq is I/O-bound and has been granted a non-null idle window. 
+ * The second compound condition is made of the logical AND of two components. + * + * The first component is true only if there is no weight-raised busy + * queue. This guarantees that the device is not idled for a sync non- + * weight-raised queue when there are busy weight-raised queues. The former + * is then expired immediately if empty. Combined with the timestamping + * rules of BFQ (see [1] for details), this causes sync non-weight-raised + * queues to get a lower number of requests served, and hence to ask for a + * lower number of requests from the request pool, before the busy weight- + * raised queues get served again. + * + * This is beneficial for the processes associated with weight-raised + * queues, when the request pool is saturated (e.g., in the presence of + * write hogs). In fact, if the processes associated with the other queues + * ask for requests at a lower rate, then weight-raised processes have a + * higher probability to get a request from the pool immediately (or at + * least soon) when they need one. Hence they have a higher probability to + * actually get a fraction of the disk throughput proportional to their + * high weight. This is especially true with NCQ-capable drives, which + * enqueue several requests in advance and further reorder internally- + * queued requests. + * + * In the end, mistreating non-weight-raised queues when there are busy + * weight-raised queues seems to mitigate starvation problems in the + * presence of heavy write workloads and NCQ, and hence to guarantee a + * higher application and system responsiveness in these hostile scenarios. + * + * If the first component of the compound condition is instead true, i.e., + * there is no weight-raised busy queue, then the second component of the + * compound condition takes into account service-guarantee and throughput + * issues related to NCQ (recall that the compound condition is evaluated + * only if the device is detected as supporting NCQ). + * + * As for service guarantees, allowing the drive to enqueue more than one + * request at a time, and hence delegating de facto final scheduling + * decisions to the drive's internal scheduler, causes loss of control on + * the actual request service order. In this respect, when the drive is + * allowed to enqueue more than one request at a time, the service + * distribution enforced by the drive's internal scheduler is likely to + * coincide with the desired device-throughput distribution only in the + * following, perfectly symmetric, scenario: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Even in such a scenario, sequential I/O may still receive a preferential + * treatment, but this is not likely to be a big issue with flash-based + * devices, because of their non-dramatic loss of throughput with random + * I/O. Things do differ with HDDs, for which additional care is taken, as + * explained after completing the discussion for flash-based devices. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. 
+ * Therefore BFQ evaluates instead the following stronger sub-conditions, + * for which it is much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, hence no state needs + * to be maintained in this case. + * + * According to the above considerations, the second component of the + * compound condition evaluates to true if any of the above symmetry + * sub-condition does not hold, or the device is not flash-based. Therefore, + * if also the first component is true, then idling is allowed for a sync + * queue. These are the only sub-conditions considered if the device is + * flash-based, as, for such a device, it is sensible to force idling only + * for service-guarantee issues. In fact, as for throughput, idling + * NCQ-capable flash-based devices would not boost the throughput even + * with sequential I/O; rather it would lower the throughput in proportion + * to how fast the device is. In the end, (only) if all the three + * sub-conditions hold and the device is flash-based, the compound + * condition evaluates to false and therefore no idling is performed. + * + * As already said, things change with a rotational device, where idling + * boosts the throughput with sequential I/O (even with NCQ). Hence, for + * such a device the second component of the compound condition evaluates + * to true also if the following additional sub-condition does not hold: + * the queue is constantly seeky. Unfortunately, this different behavior + * with respect to flash-based devices causes an additional asymmetry: if + * some sync queues enjoy idling and some other sync queues do not, then + * the latter get a low share of the device throughput, simply because the + * former get many requests served after being set as in service, whereas + * the latter do not. As a consequence, to guarantee the desired throughput + * distribution, on HDDs the compound expression evaluates to true (and + * hence device idling is performed) also if the following last symmetry + * condition does not hold: no other queue is benefiting from idling. Also + * this last condition is actually replaced with a simpler-to-maintain and + * stronger condition: there is no busy queue which is not constantly seeky + * (and hence may also benefit from idling). + * + * To sum up, when all the required symmetry and throughput-boosting + * sub-conditions hold, the second component of the compound condition + * evaluates to false, and hence no idling is performed. This helps to + * keep the drives' internal queues full on NCQ-capable devices, and hence + * to boost the throughput, without causing 'almost' any loss of service + * guarantees. The 'almost' follows from the fact that, if the internal + * queue of one such device is filled while all the sub-conditions hold, + * but at some point in time some sub-condition stops to hold, then it may + * become impossible to let requests be served in the new desired order + * until all the requests already queued in the device have been served. 
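+ *
+ * In compact form, the expression computed by the next function boils
+ * down to the following (this is only an informal restatement of the
+ * return statement below, using shorthand names for its conditions):
+ *
+ *   must_not_expire = sync && !expire_in_burst &&
+ *                     (wr_coeff > 1 || !symmetric_scenario ||
+ *                      (IO_bound && idle_window && !expire_non_wr))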
+ */ +static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; +#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ + bfqd->busy_in_flight_queues == \ + bfqd->const_seeky_busy_in_flight_queues) + +#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ + bfqd->hw_tag && \ + (blk_queue_nonrot(bfqd->queue) || \ + bfq_bfqq_constantly_seeky(bfqq))) + +/* + * Condition for expiring a non-weight-raised queue (and hence not idling + * the device). + */ +#define cond_for_expiring_non_wr (bfqd->hw_tag && \ + (bfqd->wr_busy_queues > 0 || \ + (blk_queue_nonrot(bfqd->queue) || \ + cond_for_seeky_on_ncq_hdd))) + + return bfq_bfqq_sync(bfqq) && + !cond_for_expiring_in_burst && + (bfqq->wr_coeff > 1 || !symmetric_scenario || + (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && + !cond_for_expiring_non_wr) + ); +} + +/* + * If the in-service queue is empty but sync, and the function + * bfq_bfqq_must_not_expire returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the disk must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments to the function bfq_bfqq_must_not_expire for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_must_not_expire itself + * returns true. + */ +static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_must_not_expire(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (bfqq == NULL) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !timer_pending(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq != NULL) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. 
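+ * ("Keeping" it here means leaving it as the in-service queue while
+ * returning NULL, so that nothing is dispatched while idling goes on.)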
+ */ + if (timer_pending(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? bfqq->pid : 0); +keep_queue: + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning + * of this weight-raising period, or the queue has + * exceeded the acceptable number of cooperations, + * then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq) || + bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); + } + } + /* Update weight both if it must be raised and if it must be lowered */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Follow expired path, else get first next available. */ + rq = bfq_check_fifo(bfqq); + if (rq == NULL) + rq = bfqq->next_rq; + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* + * This may happen if the next rq is chosen in fifo order + * instead of sector order. The budget is properly + * dimensioned to be always sufficient to serve the next + * request only if it is chosen in sector order. The reason + * is that it would be quite inefficient and little useful + * to always make sure that the budget is large enough to + * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. + * + * Expire the queue for budget exhaustion, and make sure + * that the next act_budget is enough to serve the next + * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* + * Since this dispatch is failed, make sure that + * a new one will be performed + */ + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + goto expire; + } + + /* Finally, insert request into driver dispatch list. 
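+ * The request is first charged (service_to_charge) to the queue via
+ * bfq_bfqq_served(), and only then moved to the dispatch list.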
*/ + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (bfqd->in_service_bic == NULL) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && + dispatched >= bfqd->bfq_max_budget_async_rq) || + bfq_class_idle(bfqq))) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq != NULL) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + bfqq->max_budget = bfq_max_budget(bfqd); + + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + int max_dispatch; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + bfqq = bfq_select_queue(bfqd); + if (bfqq == NULL) + return 0; + + if (bfq_class_idle(bfqq)) + max_dispatch = 1; + + if (!bfq_bfqq_sync(bfqq)) + max_dispatch = bfqd->bfq_max_budget_async_rq; + + if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { + if (bfqd->busy_queues > 1) + return 0; + if (bfqq->dispatched >= 4 * max_dispatch) + return 0; + } + + if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) + return 0; + + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. 
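+ * The queue itself is freed only when the last of these references is
+ * dropped, i.e., when the atomic_dec_and_test() below succeeds.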
+ */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + BUG_ON(atomic_read(&bfqq->ref) <= 0); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, + atomic_read(&bfqq->ref)); + if (!atomic_dec_and_test(&bfqq->ref)) + return; + + BUG_ON(rb_first(&bfqq->sort_list) != NULL); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqd->in_service_queue == bfqq); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); +} + +static inline void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bic->ttime.last_end_request = jiffies; + /* + * A newly created bic indicates that the process has just + * started doing I/O, and is probably mapping into memory its + * executable and libraries: it definitely needs weight raising. + * There is however the possibility that the process performs, + * for a while, I/O close to some other process. EQM intercepts + * this behavior and may merge the queue corresponding to the + * process with some other queue, BEFORE the weight of the queue + * is raised. Merged queues are not weight-raised (they are assumed + * to belong to processes that benefit only from high throughput). + * If the merge is basically the consequence of an accident, then + * the queue will be split soon and will get back its old weight. + * It is then important to write down somewhere that this queue + * does need weight raising, even if it did not make it to get its + * weight raised before being merged. To this purpose, we overload + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ + bic->wr_time_left = 1; +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic->bfqq[BLK_RW_ASYNC]) { + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); + bic->bfqq[BLK_RW_ASYNC] = NULL; + } + + if (bic->bfqq[BLK_RW_SYNC]) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. 
+ */ + if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); + bic->bfqq[BLK_RW_SYNC] = NULL; + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + struct io_context *ioc = bic->icq.ioc; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info.dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->entity.new_ioprio = task_nice_ioprio(tsk); + bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->entity.new_ioprio = task_ioprio(ioc); + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->entity.new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->entity.new_ioprio < 0 || + bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { + printk(KERN_CRIT "bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->entity.new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->entity.new_ioprio); + bfqq->entity.ioprio_changed = 1; +} + +static void bfq_check_ioprio_change(struct io_context *ioc, + struct bfq_io_cq *bic) +{ + struct bfq_data *bfqd; + struct bfq_queue *bfqq, *new_bfqq; + struct bfq_group *bfqg; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), + &flags); + if (unlikely(bfqd == NULL)) + return; + + bic->ioprio = ioprio; + + bfqq = bic->bfqq[BLK_RW_ASYNC]; + if (bfqq != NULL) { + bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, + sched_data); + new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic->icq.ioc, + GFP_ATOMIC); + if (new_bfqq != NULL) { + bic->bfqq[BLK_RW_ASYNC] = new_bfqq; + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } + } + + bfqq = bic->bfqq[BLK_RW_SYNC]; + if (bfqq != NULL) + bfq_set_next_ioprio_data(bfqq, bic); + + bfq_put_bfqd_unlock(bfqd, &flags); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = 0; + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. 
+ */ + bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int is_sync, + struct io_context *ioc, + gfp_t gfp_mask) +{ + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct bfq_io_cq *bic; + +retry: + bic = bfq_bic_lookup(bfqd, ioc); + /* bic always exists here */ + bfqq = bic_to_bfqq(bic, is_sync); + + /* + * Always try a new alloc if we fall back to the OOM bfqq + * originally, since it should just be a temporary situation. + */ + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = NULL; + if (new_bfqq != NULL) { + bfqq = new_bfqq; + new_bfqq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + spin_unlock_irq(bfqd->queue->queue_lock); + new_bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + spin_lock_irq(bfqd->queue->queue_lock); + if (new_bfqq != NULL) + goto retry; + } else { + bfqq = kmem_cache_alloc_node(bfq_pool, + gfp_mask | __GFP_ZERO, + bfqd->queue->node); + } + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } + } + + if (new_bfqq != NULL) + kmem_cache_free(bfq_pool, new_bfqq); + + return bfqq; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask) +{ + const int ioprio = task_ioprio(ioc); + const int ioprio_class = task_ioprio_class(ioc); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + } + + if (bfqq == NULL) + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, ioc, gfp_mask); + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, atomic_read(&bfqq->ref)); + *async_bfqq = bfqq; + } + + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, + atomic_read(&bfqq->ref)); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + unsigned long elapsed = jiffies - bic->ttime.last_end_request; + unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); + + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; + bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / + bic->ttime.ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *rq) +{ + sector_t sdist; + u64 total; + + if (bfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - bfqq->last_request_pos; + else + sdist = bfqq->last_request_pos - blk_rq_pos(rq); + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc. 
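+ * Both the seek distance and the think time (see
+ * bfq_update_io_thinktime() above) are then folded into the same kind
+ * of exponentially weighted average, roughly:
+ *   total = (7 * total + 256 * sample) / 8,  mean = total / samples.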
+ */ + if (bfqq->seek_samples == 0) /* first request, not really a seek */ + sdist = 0; + else if (bfqq->seek_samples <= 60) /* second & third seek */ + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); + else + sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); + + bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, + (u64)bfqq->seek_mean); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (bfq_bfqq_just_split(bfqq)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->nr_tasks) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { + bfq_clear_bfqq_constantly_seeky(bfqq); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + bfq_clear_bfqq_just_split(bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), + (long long unsigned)bfqq->seek_mean); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + int budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the disk is being idled to wait for + * a new request from the in-service queue, we avoid + * unplugging the device and committing the disk to serve + * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: + * hopefully, new requests will be merged to this one + * quickly, then the device will be unplugged and + * larger requests will be dispatched. 
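+ * ("Small" here means that this is the only request queued for its
+ * sync/async class and that it spans fewer than 32 sectors, as
+ * computed in small_req above.)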
+ */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + del_timer(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq != NULL) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + atomic_inc(&new_bfqq->ref); + bfq_put_queue(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } else + bfq_bfqq_increase_failed_cooperations(bfqq); + } + + bfq_add_request(rq); + + /* + * Here a newly-created bfq_queue has already started a weight-raising + * period: clear raising_time_left to prevent bfq_bfqq_save_state() + * from assigning it a full weight-raising period. See the detailed + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic != NULL) + bfqq->bic->wr_time_left = 0; + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. 
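+ * Concretely, a sample is counted only when rq_in_driver + queued
+ * reaches BFQ_HW_QUEUE_THRESHOLD, and hw_tag is finally decided after
+ * BFQ_HW_QUEUE_SAMPLES such samples (see the checks below).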
+ */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + bool sync = bfq_bfqq_sync(bfqq); + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", + blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + + if (sync) { + bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; + } + + /* + * If we are waiting to discover whether the request pattern of the + * task associated with the queue is actually isochronous, and + * both requisites for this condition to hold are satisfied, then + * compute soft_rt_next_start (see the comments to the function + * bfq_bfqq_softrt_next_start()). + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + + if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_must_not_expire(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, 0, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, int rw) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); + if (bfqq != NULL) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. 
+ */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq != NULL) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to said bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. + */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + unsigned long flags; + bool split = false; + + /* handle changed prio notifications; cgroup change is handled separately */ + if (unlikely(icq_get_changed(&bic->icq) & ICQ_IOPRIO_CHANGED)) + bfq_check_ioprio_change(bic->icq.ioc, bic); + + might_sleep_if(gfp_mask & __GFP_WAIT); + + spin_lock_irqsave(q->queue_lock, flags); + + if (bic == NULL) + goto queue_fail; + + bfqg = bfq_bic_update_cgroup(bic); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { + bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic->icq.ioc, gfp_mask); + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + } + } + + bfqq->allocated[rw]++; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, + atomic_read(&bfqq->ref)); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. 
+ */ + bfq_bfqq_resume_state(bfqq, bic); + } + } + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) +{ + struct bfq_data *bfqd = (struct bfq_data *)data; + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. + */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, 1, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + del_timer_sync(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq != NULL) { + bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). 
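+ * In practice this walks the async_bfqq[2][IOPRIO_BE_NR] array (RT and
+ * BE classes) plus the single idle-class queue, moving each non-NULL
+ * queue to the root group and dropping the group's reference to it.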
+ */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + + bfq_disconnect_groups(bfqd); + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + synchronize_rcu(); + + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + + bfq_free_root_group(bfqd); + kfree(bfqd); +} + +static void *bfq_init_queue(struct request_queue *q) +{ + struct bfq_group *bfqg; + struct bfq_data *bfqd; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (bfqd == NULL) + return NULL; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); + bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.entity.new_ioprio); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.ioprio_changed = 1; + + bfqd->queue = q; + + bfqg = bfq_alloc_root_group(bfqd, q->node); + if (bfqg == NULL) { + kfree(bfqd); + return NULL; + } + + bfqd->root_group = bfqg; + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); +#ifdef CONFIG_CGROUP_BFQIO + bfqd->active_numerous_groups = 0; +#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_class_idle_last_service = 0; + bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + + bfqd->bfq_coop_thresh = 2; + bfqd->bfq_failed_cooperations = 7000; + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 11; + bfqd->bfq_burst_interval = msecs_to_jiffies(500); + + bfqd->low_latency = true; + + bfqd->bfq_wr_coeff = 20; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + 
* to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + bfqd->busy_in_flight_queues = 0; + bfqd->const_seeky_busy_in_flight_queues = 0; + + /* + * Begin by assuming, optimistically, that the device peak rate is + * equal to the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->device_speed = BFQ_BFQD_FAST; + + return bfqd; +} + +static void bfq_slab_kill(void) +{ + if (bfq_pool != NULL) + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (bfq_pool == NULL) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? + jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1], + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_max_budget_async_rq_show, + bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, 
bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 1); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) +{ + u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); + + if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) + return bfq_calc_max_budget(bfqd->peak_rate, timeout); + else + return bfq_default_max_budget; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long 
uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(max_budget), + BFQ_ATTR(max_budget_async_rq), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, + .elevator_allow_merge_fn = bfq_allow_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; + +static int __init bfq_init(void) +{ + /* + * Can be 0 on HZ < 1000 setups. + */ + if (bfq_slice_idle == 0) + bfq_slice_idle = 1; + + if (bfq_timeout_async == 0) + bfq_timeout_async = 1; + + if (bfq_slab_setup()) + return -ENOMEM; + + /* + * Times to load large popular applications for the typical systems + * installed on the reference devices (see the comments before the + * definitions of the two arrays). + */ + T_slow[0] = msecs_to_jiffies(2600); + T_slow[1] = msecs_to_jiffies(1000); + T_fast[0] = msecs_to_jiffies(5500); + T_fast[1] = msecs_to_jiffies(2000); + + /* + * Thresholds that determine the switch between speed classes (see + * the comments before the definition of the array). + */ + device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; + device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; + + elv_register(&iosched_bfq); + pr_info("BFQ I/O-scheduler: v7r8"); + + return 0; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 index 000000000..d0890c6d4 --- /dev/null +++ b/block/bfq-sched.c @@ -0,0 +1,1180 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. 
+ * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifdef CONFIG_CGROUP_BFQIO +#define for_each_entity(entity) \ + for (; entity != NULL; entity = entity->parent) + +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd); + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + + BUG_ON(next_in_service == NULL); + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) + bfqg_entity->budget = next_in_service->budget; +} + +static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + struct bfq_entity *next_in_service; + + if (sd->in_service_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + + /* + * NOTE: this can be improved in many ways, such as returning + * 1 (and thus propagating upwards the update) only when the + * budget changes, or caching the bfqq that will be scheduled + * next from this subtree. By now we worry more about + * correctness than about performance... + */ + next_in_service = bfq_lookup_next_entity(sd, 0, NULL); + sd->next_in_service = next_in_service; + + if (next_in_service != NULL) + bfq_update_budget(next_in_service); + + return 1; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ + BUG_ON(sd->next_in_service != entity); +} +#else +#define for_each_entity(entity) \ + for (; entity != NULL; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + +static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + +static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *entity) +{ +} + +static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static inline int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(entity == NULL); + + if (entity->my_sched_data == NULL) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). 
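+ *
+ * Returns @service scaled into the virtual time domain, i.e.
+ * (service << WFQ_SERVICE_SHIFT) / weight.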
+ */ +static inline u64 bfq_delta(unsigned long service, + unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static inline void bfq_calc_finish(struct bfq_entity *entity, + unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq != NULL) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node != NULL) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static inline void bfq_extract(struct rb_root *root, + struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree != NULL); + + while (*node != NULL) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. 
The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static inline void bfq_update_min(struct bfq_entity *entity, + struct rb_node *node) +{ + struct bfq_entity *child; + + if (node != NULL) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static inline void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (parent == NULL) + return; + + if (node == parent->rb_left && parent->rb_right != NULL) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left != NULL) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. 
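The following standalone sketch (not part of the patch) shows the augmented-tree idea behind bfq_update_min()/bfq_update_active_tree() on a plain binary tree: each node caches the minimum start time of its subtree, and the cache is refreshed from the deepest modified node up to the root. It deliberately ignores rb-tree rotations, which is why the real code also refreshes siblings along the path.

#include <stdint.h>
#include <stdio.h>

struct node {
	uint64_t start;
	uint64_t min_start;                 /* min start over this subtree */
	struct node *left, *right, *parent;
};

static void update_node(struct node *n)
{
	n->min_start = n->start;
	if (n->left && n->left->min_start < n->min_start)
		n->min_start = n->left->min_start;
	if (n->right && n->right->min_start < n->min_start)
		n->min_start = n->right->min_start;
}

/* Propagate towards the root after @n, the deepest modified node, changed. */
static void update_tree(struct node *n)
{
	for (; n != NULL; n = n->parent)
		update_node(n);
}

int main(void)
{
	struct node a = { .start = 50 }, b = { .start = 20 }, c = { .start = 70 };

	a.left = &b;  b.parent = &a;
	a.right = &c; c.parent = &a;
	update_node(&b); update_node(&c); update_node(&a);

	b.start = 5;        /* a child becomes eligible earlier ...        */
	update_tree(&b);    /* ... and the cached minima follow it upwards */
	printf("root min_start = %llu\n", (unsigned long long)a.min_start);
	return 0;
}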
+ */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left != NULL) + node = node->rb_left; + else if (node->rb_right != NULL) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; + if (bfqg->active_entities == 2) + bfqd->active_numerous_groups++; + } +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static inline unsigned short bfq_ioprio_to_weight(int ioprio) +{ + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as mush as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ +static inline unsigned short bfq_weight_to_ioprio(int weight) +{ + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq != NULL) { + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (node->rb_right == NULL && node->rb_left == NULL) + deepest = rb_parent(node); + else if (node->rb_right == NULL) + deepest = node->rb_left; + else if (node->rb_left == NULL) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right != NULL) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. 
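As a side note (not part of the patch), the two conversions above are each other's inverse over the legacy ioprio range, with 0 acting as the escape value for large cgroup weights. The sketch below hard-codes IOPRIO_BE_NR to its kernel value of 8 and prints the round trip.

#include <stdio.h>

#define IOPRIO_BE_NR 8

static unsigned short ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;           /* ioprio 0..7 -> weight 8..1 */
}

static unsigned short weight_to_ioprio(int weight)
{
	/* Weights >= IOPRIO_BE_NR (reachable only via cgroups) collapse
	 * to ioprio 0, the escape value mentioned above. */
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	for (int p = 0; p < IOPRIO_BE_NR; p++)
		printf("ioprio %d -> weight %u -> ioprio %u\n",
		       p, ioprio_to_weight(p),
		       weight_to_ioprio(ioprio_to_weight(p)));
	printf("weight 1000 -> ioprio %u\n", weight_to_ioprio(1000));
	return 0;
}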
+ */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node != NULL) + bfq_update_active_tree(node); + +#ifdef CONFIG_CGROUP_BFQIO + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_CGROUP_BFQIO + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; + if (bfqg->active_entities == 1) { + BUG_ON(!bfqd->active_numerous_groups); + bfqd->active_numerous_groups--; + } + } +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - remove an entity from the wfq trees. + * @st: the service tree. + * @entity: the entity being removed. + * + * Update the device status and forget everything about @entity, putting + * the device reference to it, if it is a queue. Entities belonging to + * groups are not refcounted. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd; + + BUG_ON(!entity->on_st); + + entity->on_st = 0; + st->wsum -= entity->weight; + if (bfqq != NULL) { + sd = entity->sched_data; + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. 
+ */ + st->vtime = last_idle->finish; + } + + if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned short prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +#ifdef CONFIG_CGROUP_BFQIO + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq != NULL) + bfqd = bfqq->bfqd; +#ifdef CONFIG_CGROUP_BFQIO + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; + BUG_ON(!bfqd); + } +#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + printk(KERN_CRIT "update_weight_prio: " + "new_weight %d\n", + entity->new_weight); + BUG(); + } + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + entity->ioprio_class = entity->new_ioprio_class; + entity->ioprio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq != NULL ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, remove the entity + * from its old weight counter (if there is a counter + * associated with the entity), and add it to the counter + * associated with its new weight. + */ + if (prev_weight != new_weight) { + root = bfqq ? &bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); + } + entity->weight = new_weight; + /* + * Add the entity to its weights tree only if it is + * not associated with a weight-raised queue. + */ + if (prev_weight != new_weight && + (bfqq ? bfqq->wr_coeff == 1 : 1)) + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, entity, root); + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + BUG_ON(entity->service > entity->budget); + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); +} + +/** + * bfq_bfqq_charge_full_budget - set the service to the entity budget. 
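An illustrative reduction (not part of the patch) of the deferred-update pattern used by __bfq_entity_update_weight_prio(): requested values are parked in new_* fields and a changed flag, and are only folded in at a safe point such as a requeue. All identifiers below are toy names.

#include <stdio.h>

struct entity {
	unsigned short weight, new_weight;
	int changed;
};

static void request_weight(struct entity *e, unsigned short w)
{
	e->new_weight = w;      /* record the request ...                  */
	e->changed = 1;         /* ... and flag it for the next safe point */
}

static void commit_on_requeue(struct entity *e)
{
	if (e->changed) {
		e->weight = e->new_weight;
		e->changed = 0;
	}
}

int main(void)
{
	struct entity e = { .weight = 4, .new_weight = 4 };

	request_weight(&e, 8);
	printf("before requeue: weight = %u\n", e.weight); /* still 4 */
	commit_on_requeue(&e);
	printf("after requeue:  weight = %u\n", e.weight); /* now 8   */
	return 0;
}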
+ * @bfqq: the queue that needs a service update. + * + * When it's not possible to be fair in the service domain, because + * a queue is not consuming its budget fast enough (the meaning of + * fast depends on the timeout parameter), we charge it a full + * budget. In this way we should obtain a sort of time-domain + * fairness among all the seeky/slow queues. + */ +static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); + + bfq_bfqq_served(bfqq, entity->budget - entity->service); +} + +/** + * __bfq_activate_entity - activate an entity. + * @entity: the entity being activated. + * + * Called whenever an entity is activated, i.e., it is not active and one + * of its children receives a new request, or has to be reactivated due to + * budget exhaustion. It uses the current budget of the entity (and the + * service received if @entity is active) of the queue to calculate its + * timestamps. + */ +static void __bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have + * to take care of not charging to it service it has + * not received. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some + * next_in_service entity below it. We reuse the + * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(st->vtime, entity->finish) ? + st->vtime : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = st->vtime; + st->wsum += entity->weight; + bfq_get_entity(entity); + + BUG_ON(entity->on_st); + entity->on_st = 1; + } + + st = __bfq_entity_update_weight_prio(st, entity); + bfq_calc_finish(entity, entity->budget); + bfq_active_insert(st, entity); +} + +/** + * bfq_activate_entity - activate an entity and its ancestors if necessary. + * @entity: the entity to activate. + * + * Activate @entity and all the entities on the path from it to the root. + */ +static void bfq_activate_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when + * the in-service entity is rescheduled. + */ + break; + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @requeue: if false, the entity will not be put into the idle tree. + * + * Deactivate an entity, independently from its previous state. If the + * entity was not on a service tree just return, otherwise if it is on + * any scheduler tree, extract it from that tree, and if necessary + * and if the caller did not specify @requeue, put it on the idle tree. 
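To make the time-domain charging of bfq_bfqq_charge_full_budget() concrete (illustrative only, not part of the patch): charging the whole budget instead of the service actually received pushes the finish timestamp much further ahead, so a slow or seeky queue is scheduled again only after a correspondingly longer virtual-time interval.

#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT 22

static uint64_t vt_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	uint64_t start = 0;
	unsigned long budget = 16384, consumed = 1024, weight = 100;

	printf("finish if charged only the consumed service: %llu\n",
	       (unsigned long long)(start + vt_delta(consumed, weight)));
	printf("finish after charging the full budget:       %llu\n",
	       (unsigned long long)(start + vt_delta(budget, weight)));
	return 0;
}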
+ * + * Return %1 if the caller should update the entity hierarchy, i.e., + * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + int was_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + + BUG_ON(was_in_service && entity->tree != NULL); + + if (was_in_service) { + bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree != NULL) + BUG(); + + if (was_in_service || sd->next_in_service == entity) + ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + + BUG_ON(sd->in_service_entity == entity); + BUG_ON(sd->next_in_service == entity); + + return ret; +} + +/** + * bfq_deactivate_entity - deactivate an entity. + * @entity: the entity to deactivate. + * @requeue: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, requeue)) + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still + * in service. + */ + break; + + if (sd->next_in_service != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root + * need to be updated. + */ + goto update; + + /* + * If we reach there the parent is no more backlogged and + * we want to propagate the dequeue upwards. + */ + requeue = 1; + } + + return; + +update: + entity = parent; + for_each_entity(entity) { + __bfq_activate_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd)) + break; + } +} + +/** + * bfq_update_vtime - update vtime if necessary. + * @st: the service tree to act upon. + * + * If necessary update the service tree vtime to have at least one + * eligible entity, skipping to its start time. Assumes that the + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, + * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ +static void bfq_update_vtime(struct bfq_service_tree *st) +{ + struct bfq_entity *entry; + struct rb_node *node = st->active.rb_node; + + entry = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entry->min_start, st->vtime)) { + st->vtime = entry->min_start; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. 
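A naive, standalone restatement (not part of the patch) of the rule that bfq_update_vtime() and bfq_first_active_entity() implement on the augmented rbtree: if no entity is eligible, jump vtime to the smallest start time, then pick the eligible entity with the smallest finish time. The flat array and values below are purely illustrative.

#include <stdint.h>
#include <stdio.h>

struct ent { uint64_t start, finish; };

static int pick_next(const struct ent *e, int n, uint64_t *vtime)
{
	uint64_t min_start = UINT64_MAX;
	int best = -1;

	for (int i = 0; i < n; i++)
		if (e[i].start < min_start)
			min_start = e[i].start;
	if (min_start > *vtime)            /* nothing eligible: skip ahead */
		*vtime = min_start;

	for (int i = 0; i < n; i++)
		if (e[i].start <= *vtime &&
		    (best < 0 || e[i].finish < e[best].finish))
			best = i;
	return best;
}

int main(void)
{
	struct ent e[] = { { 10, 40 }, { 25, 30 }, { 30, 35 } };
	uint64_t vtime = 0;
	int i = pick_next(e, 3, &vtime);

	/* vtime jumps to 10; entity 1 has the smallest finish time but is
	 * not yet eligible, so entity 0 is picked. */
	printf("vtime=%llu picked=%d finish=%llu\n",
	       (unsigned long long)vtime, i, (unsigned long long)e[i].finish);
	return 0;
}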
+ */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node != NULL) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, st->vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, st->vtime)); + + if (node->rb_left != NULL) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, st->vtime)) { + node = node->rb_left; + goto left; + } + } + if (first != NULL) + break; + node = node->rb_right; + } + + BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * Update the virtual time in @st and return the first eligible entity + * it contains. + */ +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ + struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + bfq_update_vtime(st); + entity = bfq_first_active_entity(st); + BUG_ON(bfq_gt(entity->start, st->vtime)); + + /* + * If the chosen entity does not match with the sched_data's + * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ + if (unlikely(force && entity != entity->sched_data->next_in_service)) { + new_next_in_service = entity; + for_each_entity(new_next_in_service) + bfq_update_budget(new_next_in_service); + } + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * + * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with + * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + int extract, + struct bfq_data *bfqd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; + int i = 0; + + BUG_ON(sd->in_service_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { + entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, + true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; + sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { + bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); + sd->in_service_entity = entity; + sd->next_in_service = NULL; + } + break; + } + } + + return entity; +} + +/* + * Get next queue for service. 
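For reference (not part of the patch), the class handling in bfq_lookup_next_entity() boils down to: serve the idle class once if it has waited longer than BFQ_CL_IDLE_TIMEOUT, otherwise scan RT, then BE, then IDLE. A toy version with hypothetical numbers:

#include <stdbool.h>
#include <stdio.h>

enum { CL_RT, CL_BE, CL_IDLE, CL_NR };

static int pick_class(const bool backlogged[CL_NR],
		      unsigned long now, unsigned long idle_last_service,
		      unsigned long idle_timeout)
{
	if (backlogged[CL_IDLE] && now - idle_last_service > idle_timeout)
		return CL_IDLE;                 /* anti-starvation exception */
	for (int c = CL_RT; c < CL_NR; c++)
		if (backlogged[c])
			return c;               /* highest backlogged class  */
	return -1;
}

int main(void)
{
	bool backlogged[CL_NR] = { true, true, true };

	printf("picked class %d\n", pick_class(backlogged, 100, 90, 50)); /* 0 */
	printf("picked class %d\n", pick_class(backlogged, 200, 90, 50)); /* 2 */
	return 0;
}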
+ */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->in_service_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; + + sd = &bfqd->root_group->sched_data; + for (; sd != NULL; sd = entity->my_sched_data) { + entity = bfq_lookup_next_entity(sd, 1, bfqd); + BUG_ON(entity == NULL); + entity->service = 0; + } + + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(bfqq == NULL); + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + if (bfqd->in_service_bic != NULL) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfqd->in_service_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq == bfqd->in_service_queue) + __bfq_bfqd_reset_in_service(bfqd); + + bfq_deactivate_entity(entity, requeue); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_entity(entity); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + int requeue) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + if (!bfqq->dispatched) { + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + BUG_ON(!bfqd->busy_in_flight_queues); + bfqd->busy_in_flight_queues--; + if (bfq_bfqq_constantly_seeky(bfqq)) { + BUG_ON(!bfqd-> + const_seeky_busy_in_flight_queues); + bfqd->const_seeky_busy_in_flight_queues--; + } + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; + + if (!bfqq->dispatched) { + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + if (!blk_queue_nonrot(bfqd->queue)) { + bfqd->busy_in_flight_queues++; + if (bfq_bfqq_constantly_seeky(bfqq)) + bfqd->const_seeky_busy_in_flight_queues++; + } + } + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 index 000000000..e6f1a88f4 --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,806 @@ +/* + * BFQ-v7r8 for 3.4.0: data structures and common functions prototypes. 
+ * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * @active: tree for active entities (i.e., those backlogged). + * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). + * @first_idle: idle entity with minimum F_i. + * @last_idle: idle entity with maximum F_i. + * @vtime: scheduler virtual time. + * @wsum: scheduler weight sum; active and idle entities contribute to it. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + struct rb_root active; + struct rb_root idle; + + struct bfq_entity *first_idle; + struct bfq_entity *last_idle; + + u64 vtime; + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * @in_service_entity: entity in service. + * @next_in_service: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. + * @next_in_service points to the active entity of the sched_data + * service trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *in_service_entity; + struct bfq_entity *next_in_service; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. + * @weight: weight of the entities that this counter refers to. + * @num_active: number of active entities with this weight. + * @weights_node: weights tree member (see bfq_data's @queue_weights_tree + * and @group_weights_tree). + */ +struct bfq_weight_counter { + short int weight; + unsigned int num_active; + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. + * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). + * @start: B-WF2Q+ start timestamp (aka S_i). + * @tree: tree the entity is enqueued into; %NULL if not on a tree. + * @min_start: minimum start time of the (active) subtree rooted at + * this entity; used for O(log N) lookups into active trees. 
+ * @service: service received during the last round of service. + * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. + * @weight: weight of the queue + * @parent: parent entity, for hierarchical scheduling. + * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the + * associated scheduler queue, %NULL on leaf nodes. + * @sched_data: the scheduler queue this entity belongs to. + * @ioprio: the ioprio in use. + * @new_weight: when a weight change is requested, the new weight value. + * @orig_weight: original weight, used to implement weight boosting + * @new_ioprio: when an ioprio change is requested, the new ioprio value. + * @ioprio_class: the ioprio_class in use. + * @new_ioprio_class: when an ioprio_class change is requested, the new + * ioprio_class value. + * @ioprio_changed: flag, true when the user requested a weight, ioprio or + * ioprio_class change. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @ioprio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; + struct bfq_weight_counter *weight_counter; + + int on_st; + + u64 finish; + u64 start; + + struct rb_root *tree; + + u64 min_start; + + unsigned long service, budget; + unsigned short weight, new_weight; + unsigned short orig_weight; + + struct bfq_entity *parent; + + struct bfq_sched_data *my_sched_data; + struct bfq_sched_data *sched_data; + + unsigned short ioprio, new_ioprio; + unsigned short ioprio_class, new_ioprio_class; + + int ioprio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * @ref: reference counter. + * @bfqd: parent bfq_data. + * @new_bfqq: shared bfq_queue if queue is cooperating with + * one or more other queues. + * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). + * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). + * @sort_list: sorted list of pending requests. + * @next_rq: if fifo isn't expired, next request to serve. + * @queued: nr of requests queued in @sort_list. + * @allocated: currently allocated requests. + * @meta_pending: pending metadata requests. + * @fifo: fifo list of requests in sort_list. + * @entity: entity representing this queue in the scheduler. + * @max_budget: maximum budget allowed from the feedback mechanism. 
+ * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. + * @burst_list_node: node for the device's burst list. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued + * @requests_within_timer: number of consecutive pairs of request completion + * and arrival, such that the queue becomes idle + * after the completion, but the next request arrives + * within an idle time slice; used only if the queue's + * IO_bound has been cleared. + * @pid: pid of the process owning the queue, used for logging purposes. + * @last_wr_start_finish: start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period + * @wr_cur_max_time: current max raising time for this queue + * @soft_rt_next_start: minimum time instant such that, only if a new + * request is enqueued after this time instant in an + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function + * bfq_bfqq_softrt_next_start()) + * @last_idle_bklogged: time of the last transition of the @bfq_queue from + * idle to backlogged + * @service_from_backlogged: cumulative service received from the @bfq_queue + * since the last transition from idle to + * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_queue { + atomic_t ref; + struct bfq_data *bfqd; + + /* fields for cooperating queues handling */ + struct bfq_queue *new_bfqq; + struct rb_node pos_node; + struct rb_root *pos_root; + + struct rb_root sort_list; + struct request *next_rq; + int queued[2]; + int allocated[2]; + int meta_pending; + struct list_head fifo; + + struct bfq_entity entity; + + unsigned long max_budget; + unsigned long budget_timeout; + + int dispatched; + + unsigned int flags; + + struct list_head bfqq_list; + + struct hlist_node burst_list_node; + + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + + unsigned int requests_within_timer; + + pid_t pid; + struct bfq_io_cq *bic; + + /* weight-raising fields */ + unsigned long wr_cur_max_time; + unsigned long soft_rt_next_start; + unsigned long last_wr_start_finish; + unsigned int wr_coeff; + unsigned long last_idle_bklogged; + unsigned long service_from_backlogged; +}; + +/** + * struct bfq_ttime - per process thinktime stats. 
+ * @ttime_total: total process thinktime + * @ttime_samples: number of thinktime samples + * @ttime_mean: average process thinktime + */ +struct bfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + * @icq: associated io_cq structure + * @bfqq: array of two process queues, the sync and the async + * @ttime: associated @bfq_ttime struct + * @wr_time_left: snapshot of the time left before weight raising ends + * for the sync queue associated to this process; this + * snapshot is taken to remember this value while the weight + * raising is suspended because the queue is merged with a + * shared queue, and is used to set @raising_cur_max_time + * when the queue is split from the shared queue and its + * weight is raised again + * @saved_idle_window: same purpose as the previous field for the idle + * window + * @saved_IO_bound: same purpose as the previous two fields for the I/O + * bound classification of a queue + * @saved_in_large_burst: same purpose as the previous fields for the + * value of the field keeping the queue's belonging + * to a large burst + * @was_in_burst_list: true if the queue belonged to a burst list + * before its merge with another cooperating queue + * @cooperations: counter of consecutive successful queue merges underwent + * by any of the process' @bfq_queues + * @failed_cooperations: counter of consecutive failed queue merges of any + * of the process' @bfq_queues + */ +struct bfq_io_cq { + struct io_cq icq; /* must be the first member */ + struct bfq_queue *bfqq[2]; + struct bfq_ttime ttime; + int ioprio; + + unsigned int wr_time_left; + bool saved_idle_window; + bool saved_IO_bound; + + bool saved_in_large_burst; + bool was_in_burst_list; + + unsigned int cooperations; + unsigned int failed_cooperations; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_close_cooperator()). + * @active_numerous_groups: number of bfq_groups containing more than one + * active @bfq_entity. + * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues + * have the same weight. The tree contains one counter + * for each distinct weight associated to some active + * and not weight-raised @bfq_queue (see the comments to + * the functions bfq_weights_tree_[add|remove] for + * further details). + * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted + * by weight. Used to keep track of whether all + * @bfq_groups have the same weight. The tree contains + * one counter for each distinct weight associated to + * some active @bfq_group (see the comments to the + * functions bfq_weights_tree_[add|remove] for further + * details). + * @busy_queues: number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + * @busy_in_flight_queues: number of @bfq_queues containing pending or + * in-flight requests, plus the @bfq_queue in + * service, even if idle but waiting for the + * possible arrival of its next sync request. 
This + * field is updated only if the device is rotational, + * but used only if the device is also NCQ-capable. + * The reason why the field is updated also for non- + * NCQ-capable rotational devices is related to the + * fact that the value of @hw_tag may be set also + * later than when busy_in_flight_queues may need to + * be incremented for the first time(s). Taking also + * this possibility into account, to avoid unbalanced + * increments/decrements, would imply more overhead + * than just updating busy_in_flight_queues + * regardless of the value of @hw_tag. + * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues + * (that is, seeky queues that expired + * for budget timeout at least once) + * containing pending or in-flight + * requests, including the in-service + * @bfq_queue if constantly seeky. This + * field is updated only if the device + * is rotational, but used only if the + * device is also NCQ-capable (see the + * comments to @busy_in_flight_queues). + * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. + * @max_rq_in_driver: max number of reqs in driver in the last + * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request + * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. + * @in_service_queue: bfq_queue in service. + * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. + * @bfq_max_budget: maximum budget allotted to a bfq_queue before + * rescheduling. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. + * @bfq_fifo_expire: timeout for async/sync requests; when it expires + * requests are served in fifo order. + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. + * @bfq_user_max_budget: user-configured max budget value + * (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to + * to prevent seeky queues to impose long latencies to well + * behaved ones (this also implies that seeky queues cannot + * receive guarantees in the service domain; after a timeout + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). + * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is + * no more granted any weight-raising. 
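A reduced sketch (not part of the patch) of what @queue_weights_tree, documented a few lines above, is for: one counter per distinct active weight, so "all active queues have the same weight" is simply "at most one counter is non-zero". A flat array stands in for the rbtree and all names are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define MAX_DISTINCT 16

struct wcounter { int weight; int num_active; };

static struct wcounter counters[MAX_DISTINCT];

static void weights_add(int weight)
{
	for (int i = 0; i < MAX_DISTINCT; i++)
		if (counters[i].num_active && counters[i].weight == weight) {
			counters[i].num_active++;
			return;
		}
	for (int i = 0; i < MAX_DISTINCT; i++)
		if (!counters[i].num_active) {
			counters[i].weight = weight;
			counters[i].num_active = 1;
			return;
		}
}

static void weights_remove(int weight)
{
	for (int i = 0; i < MAX_DISTINCT; i++)
		if (counters[i].num_active && counters[i].weight == weight) {
			counters[i].num_active--;
			return;
		}
}

static bool symmetric(void)
{
	int distinct = 0;

	for (int i = 0; i < MAX_DISTINCT; i++)
		if (counters[i].num_active)
			distinct++;
	return distinct <= 1;
}

int main(void)
{
	weights_add(100); weights_add(100);
	printf("symmetric: %d\n", symmetric());   /* 1 */
	weights_add(200);
	printf("symmetric: %d\n", symmetric());   /* 0 */
	weights_remove(200);
	printf("symmetric: %d\n", symmetric());   /* 1 */
	return 0;
}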
+ * @bfq_failed_cooperations: number of consecutive failed cooperation + * chances after which weight-raising is restored + * to a queue subject to more than bfq_coop_thresh + * queue merges. + * @bfq_requests_within_timer: number of consecutive requests that must be + * issued within the idle time slice to set + * again idling to a queue which was marked as + * non-I/O-bound (see the definition of the + * IO_bound flag for further details). + * @last_ins_in_burst: last time at which a queue entered the current + * burst of queues being activated shortly after + * each other; for more details about this and the + * following parameters related to a burst of + * activations, see the comments to the function + * @bfq_handle_burst. + * @bfq_burst_interval: reference time interval used to decide whether a + * queue has been activated shortly after + * @last_ins_in_burst. + * @burst_size: number of queues in the current burst of queue activations. + * @bfq_large_burst_thresh: maximum burst size above which the current + * queue-activation burst is deemed as 'large'. + * @large_burst: true if a large queue-activation burst is in progress. + * @burst_list: head of the burst list (as for the above fields, more details + * in the comments to the function bfq_handle_burst). + * @low_latency: if set to true, low-latency heuristics are enabled. + * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised + * queue is multiplied. + * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). + * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. + * @bfq_wr_min_idle_time: minimum idle period after which weight-raising + * may be reactivated for a queue (in jiffies). + * @bfq_wr_min_inter_arr_async: minimum period between request arrivals + * after which weight-raising may be + * reactivated for an already busy queue + * (in jiffies). + * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, + * sectors per seconds. + * @RT_prod: cached value of the product R*T used for computing the maximum + * duration of the weight raising automatically. + * @device_speed: device-speed class for the low-latency heuristic. + * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * + * All the fields are protected by the @queue lock. 
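An illustrative reading (not part of the patch, and simplified with respect to the real bfq_handle_burst()) of the burst fields documented above: an activation within bfq_burst_interval of the previous one grows the current burst, otherwise a new burst starts, and a burst is deemed large once it reaches bfq_large_burst_thresh. All names and numbers below are toy values.

#include <stdbool.h>
#include <stdio.h>

struct burst {
	unsigned long last_ins;
	unsigned long interval;
	int size;
	int large_thresh;
	bool large;
};

static void handle_activation(struct burst *b, unsigned long now)
{
	if (now - b->last_ins > b->interval) {
		b->size = 1;              /* too far apart: new burst */
		b->large = false;
	} else if (++b->size >= b->large_thresh) {
		b->large = true;          /* many close activations   */
	}
	b->last_ins = now;
}

int main(void)
{
	struct burst b = { .interval = 10, .large_thresh = 3 };
	unsigned long times[] = { 0, 4, 7, 9, 50 };

	for (int i = 0; i < 5; i++) {
		handle_activation(&b, times[i]);
		printf("t=%lu size=%d large=%d\n", times[i], b.size, b.large);
	}
	return 0;
}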
+ */ +struct bfq_data { + struct request_queue *queue; + + struct bfq_group *root_group; + struct rb_root rq_pos_tree; + +#ifdef CONFIG_CGROUP_BFQIO + int active_numerous_groups; +#endif + + struct rb_root queue_weights_tree; + struct rb_root group_weights_tree; + + int busy_queues; + int busy_in_flight_queues; + int const_seeky_busy_in_flight_queues; + int wr_busy_queues; + int queued; + int rq_in_driver; + int sync_flight; + + int max_rq_in_driver; + int hw_tag_samples; + int hw_tag; + + int budgets_assigned; + + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + + struct bfq_queue *in_service_queue; + struct bfq_io_cq *in_service_bic; + + sector_t last_position; + + ktime_t last_budget_start; + ktime_t last_idling_start; + int peak_rate_samples; + u64 peak_rate; + unsigned long bfq_max_budget; + + struct hlist_head group_list; + struct list_head active_list; + struct list_head idle_list; + + unsigned int bfq_fifo_expire[2]; + unsigned int bfq_back_penalty; + unsigned int bfq_back_max; + unsigned int bfq_slice_idle; + u64 bfq_class_idle_last_service; + + unsigned int bfq_user_max_budget; + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + + unsigned int bfq_coop_thresh; + unsigned int bfq_failed_cooperations; + unsigned int bfq_requests_within_timer; + + unsigned long last_ins_in_burst; + unsigned long bfq_burst_interval; + int burst_size; + unsigned long bfq_large_burst_thresh; + bool large_burst; + struct hlist_head burst_list; + + bool low_latency; + + /* parameters of the low_latency heuristics */ + unsigned int bfq_wr_coeff; + unsigned int bfq_wr_max_time; + unsigned int bfq_wr_rt_max_time; + unsigned int bfq_wr_min_idle_time; + unsigned long bfq_wr_min_inter_arr_async; + unsigned int bfq_wr_max_softrt_rate; + u64 RT_prod; + enum bfq_device_speed device_speed; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. 
+ */ + BFQ_BFQQ_FLAG_constantly_seeky, /* + * bfqq has proved to be slow and + * seeky until budget timeout + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(just_split); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ +}; + +#ifdef CONFIG_CGROUP_BFQIO +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @group_node: node to be inserted into the bfqio_cgroup->group_data + * list of the containing cgroup's bfqio_cgroup. + * @bfqd_node: node to be inserted into the @bfqd->group_list list + * of the groups active on the same device; used for cleanup. + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_must_not_expire()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @group_node is protected by the bfqio_cgroup lock, and is accessed + * via RCU from its readers. + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. 
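The BFQ_BFQQ_FNS() macro above stamps out mark/clear/test helpers for each flag bit; the standalone toy below (not part of the patch) shows the same pattern with made-up names.

#include <stdio.h>

struct obj { unsigned int flags; };

enum obj_flags { OBJ_FLAG_busy = 0, OBJ_FLAG_sync };

#define OBJ_FNS(name)						\
static inline void obj_mark_##name(struct obj *o)		\
{								\
	o->flags |= (1U << OBJ_FLAG_##name);			\
}								\
static inline void obj_clear_##name(struct obj *o)		\
{								\
	o->flags &= ~(1U << OBJ_FLAG_##name);			\
}								\
static inline int obj_##name(const struct obj *o)		\
{								\
	return (o->flags & (1U << OBJ_FLAG_##name)) != 0;	\
}

OBJ_FNS(busy)
OBJ_FNS(sync)
#undef OBJ_FNS

int main(void)
{
	struct obj o = { 0 };

	obj_mark_busy(&o);
	obj_mark_sync(&o);
	printf("busy=%d sync=%d\n", obj_busy(&o), obj_sync(&o)); /* 1 1 */
	obj_clear_busy(&o);
	obj_clear_sync(&o);
	printf("busy=%d sync=%d\n", obj_busy(&o), obj_sync(&o)); /* 0 0 */
	return 0;
}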
+ */ +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct hlist_node group_node; + struct hlist_node bfqd_node; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; +}; + +/** + * struct bfqio_cgroup - bfq cgroup data structure. + * @css: subsystem state for bfq in the containing cgroup. + * @weight: cgroup weight. + * @ioprio: cgroup ioprio. + * @ioprio_class: cgroup ioprio_class. + * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. + * @group_data: list containing the bfq_group belonging to this cgroup. + * + * @group_data is accessed using RCU, with @lock protecting the updates, + * @ioprio and @ioprio_class are protected by @lock. + */ +struct bfqio_cgroup { + struct cgroup_subsys_state css; + + unsigned short weight, ioprio, ioprio_class; + + spinlock_t lock; + struct hlist_head group_data; +}; +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; +}; +#endif + +static inline struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = entity->ioprio_class - 1; + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + return sched_data->service_tree + idx; +} + +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, + bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static inline void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +/** + * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. + * @ptr: a pointer to a bfqd. + * @flags: storage for the flags to be saved. + * + * This function allows bfqg->bfqd to be protected by the + * queue lock of the bfqd they reference; the pointer is dereferenced + * under RCU, so the storage for bfqd is assured to be safe as long + * as the RCU read side critical section does not end. After the + * bfqd->queue->queue_lock is taken the pointer is rechecked, to be + * sure that no other writer accessed it. If we raced with a writer, + * the function returns NULL, with the queue unlocked, otherwise it + * returns the dereferenced pointer, with the queue locked. 
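One last illustrative aside (not part of the patch): bfq_entity_service_tree() relies on the kernel values IOPRIO_CLASS_RT/BE/IDLE being 1, 2 and 3, hence the "- 1" when indexing the per-class array. A toy version with placeholder structures:

#include <stdio.h>

#define BFQ_IOPRIO_CLASSES 3

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

struct service_tree { const char *name; };

struct sched_data {
	struct service_tree service_tree[BFQ_IOPRIO_CLASSES];
};

static struct service_tree *class_tree(struct sched_data *sd, int ioprio_class)
{
	return sd->service_tree + (ioprio_class - 1);
}

int main(void)
{
	struct sched_data sd = {
		.service_tree = { { "RT" }, { "BE" }, { "IDLE" } },
	};

	printf("%s\n", class_tree(&sd, IOPRIO_CLASS_BE)->name);   /* BE   */
	printf("%s\n", class_tree(&sd, IOPRIO_CLASS_IDLE)->name); /* IDLE */
	return 0;
}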
+ */ +static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, + unsigned long *flags) +{ + struct bfq_data *bfqd; + + rcu_read_lock(); + bfqd = rcu_dereference(*(struct bfq_data **)ptr); + + if (bfqd != NULL) { + spin_lock_irqsave(bfqd->queue->queue_lock, *flags); + if (*ptr == bfqd) + goto out; + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); + } + + bfqd = NULL; +out: + rcu_read_unlock(); + return bfqd; +} + +static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, + unsigned long *flags) +{ + spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); +} + +static void bfq_check_ioprio_change(struct io_context *ioc, + struct bfq_io_cq *bic); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct io_context *ioc, gfp_t gfp_mask); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +#endif /* _BFQ_H */ From dce21db8df013fe8dbc92040d7ab253e8cdbc628 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Tue, 21 Jun 2016 01:07:43 -0400 Subject: [PATCH 47/52] sched: Move sched.h sysctl bits into separate header Move the sysctl-related bits from include/linux/sched.h into a new file: include/linux/sched/sysctl.h. Then update source files requiring access to those bits by including the new header file. Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Pranav Vashi (cherry picked from commit 6c27edb42f0c50733a28c2a5e2fe9654801a4f01) --- block/blk-exec.c | 1 + include/linux/init_task.h | 1 + include/linux/sched.h | 86 +--------------------------------- include/linux/sched/sysctl.h | 91 ++++++++++++++++++++++++++++++++++++ kernel/hrtimer.c | 1 + kernel/sched/sched.h | 1 + kernel/sysctl.c | 1 + kernel/timer.c | 1 + mm/mmap.c | 1 + mm/mremap.c | 1 + mm/nommu.c | 1 + 11 files changed, 102 insertions(+), 84 deletions(-) create mode 100644 include/linux/sched/sysctl.h diff --git a/block/blk-exec.c b/block/blk-exec.c index 1b5cb66ef..55dd9861d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "blk.h" diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b11e298e9..2b9894067 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index bf4b7ba11..55eac4d5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -338,19 +338,6 @@ static inline void lockup_detector_init(void) } #endif -#ifdef CONFIG_DETECT_HUNG_TASK -extern unsigned int sysctl_hung_task_panic; -extern unsigned long sysctl_hung_task_check_count; -extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); -#else -/* Avoid need for ifdefs elsewhere in the code */ -enum { sysctl_hung_task_timeout_secs = 0 }; -#endif - /* Attach to any functions which should be ignored in wchan output. 
*/ #define __sched __attribute__((__section__(".sched.text"))) @@ -372,23 +359,6 @@ extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; -/* - * Default maximum number of active map areas, this limits the number of vmas - * per mm struct. Users can overwrite this number by sysctl but there is a - * problem. - * - * When a program's coredump is generated as ELF format, a section is created - * per a vma. In ELF, the number of sections is represented in unsigned short. - * This means the number of sections should be smaller than 65535 at coredump. - * Because the kernel adds some informative sections to a image of program at - * generating coredump, we need some margin. The number of extra sections is - * 1-3 now and depends on arch. We use "5" as safe margin, here. - */ -#define MAPCOUNT_ELF_CORE_MARGIN (5) -#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - -extern int sysctl_max_map_count; - #include #ifdef CONFIG_MMU @@ -1258,12 +1228,6 @@ struct sched_rt_entity { #endif }; -/* - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define RR_TIMESLICE (100 * HZ / 1000) - struct rcu_node; enum perf_event_task_context { @@ -2083,52 +2047,10 @@ extern void wake_up_idle_cpu(int cpu); static inline void wake_up_idle_cpu(int cpu) { } #endif -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_wake_to_idle; - -enum sched_tunable_scaling { - SCHED_TUNABLESCALING_NONE, - SCHED_TUNABLESCALING_LOG, - SCHED_TUNABLESCALING_LINEAR, - SCHED_TUNABLESCALING_END, -}; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; - -#ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_migration_cost; -extern unsigned int sysctl_sched_nr_migrate; -extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; -extern unsigned int sysctl_sched_shares_window; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); -#endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif -extern unsigned int sysctl_sched_rt_period; -extern int sysctl_sched_rt_runtime; - -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +extern unsigned int sysctl_sched_ravg_window; +extern unsigned int sysctl_sched_wakeup_load_threshold; #ifdef CONFIG_SCHED_AUTOGROUP -extern unsigned int sysctl_sched_autogroup_enabled; - extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); extern void sched_autogroup_fork(struct signal_struct *sig); @@ -2144,10 +2066,6 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -#ifdef CONFIG_CFS_BANDWIDTH -extern unsigned int sysctl_sched_cfs_bandwidth_slice; -#endif - #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h new file mode 
100644 index 000000000..d8c6e8606 --- /dev/null +++ b/include/linux/sched/sysctl.h @@ -0,0 +1,91 @@ +#ifndef _SCHED_SYSCTL_H +#define _SCHED_SYSCTL_H + +#ifdef CONFIG_DETECT_HUNG_TASK +extern unsigned int sysctl_hung_task_panic; +extern unsigned long sysctl_hung_task_check_count; +extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_warnings; +extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos); +#else +/* Avoid need for ifdefs elsewhere in the code */ +enum { sysctl_hung_task_timeout_secs = 0 }; +#endif + +/* + * Default maximum number of active map areas, this limits the number of vmas + * per mm struct. Users can overwrite this number by sysctl but there is a + * problem. + * + * When a program's coredump is generated as ELF format, a section is created + * per a vma. In ELF, the number of sections is represented in unsigned short. + * This means the number of sections should be smaller than 65535 at coredump. + * Because the kernel adds some informative sections to a image of program at + * generating coredump, we need some margin. The number of extra sections is + * 1-3 now and depends on arch. We use "5" as safe margin, here. + */ +#define MAPCOUNT_ELF_CORE_MARGIN (5) +#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) + +extern int sysctl_max_map_count; + +extern unsigned int sysctl_sched_latency; +extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_wake_to_idle; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + +#ifdef CONFIG_SCHED_DEBUG +extern unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; +extern unsigned int sysctl_timer_migration; +extern unsigned int sysctl_sched_shares_window; + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +#endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif +extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_runtime; + +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + +#ifdef CONFIG_SCHED_AUTOGROUP +extern unsigned int sysctl_sched_autogroup_enabled; +#endif + +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. 
+ */ +#define RR_TIMESLICE (100 * HZ / 1000) + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +#endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 6a72395d3..72bc17919 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc248e5ed..261c65543 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1,5 +1,6 @@ #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index aa8debaa1..2d0dc1461 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/timer.c b/kernel/timer.c index 219865e85..5b5310630 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 1ce374f97..07b3d1fff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/mm/mremap.c b/mm/mremap.c index db8d983b5..8855dec4e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/mm/nommu.c b/mm/nommu.c index 75ff3c173..3685abade 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include From e24975e731d7858ebfecde86a2f0c08345e6f41a Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Tue, 21 Jun 2016 01:08:32 -0400 Subject: [PATCH 48/52] Makefile: build with ccache --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 470225f3b..cd79b4837 100644 --- a/Makefile +++ b/Makefile @@ -243,8 +243,8 @@ CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ else if [ -x /bin/bash ]; then echo /bin/bash; \ else echo sh; fi ; fi) -HOSTCC = gcc -HOSTCXX = g++ +HOSTCC = ccache gcc +HOSTCXX = ccache g++ HOSTCFLAGS = -Wall -Wmissing-prototypes -Wstrict-prototypes -O2 -fomit-frame-pointer -std=gnu89 HOSTCXXFLAGS = -O2 @@ -330,7 +330,7 @@ include $(srctree)/scripts/Kbuild.include AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld -REAL_CC = $(CROSS_COMPILE)gcc +REAL_CC = ccache $(CROSS_COMPILE)gcc CPP = $(CC) -E AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm From 3da433569bff5ad82c0a79ad98f3a44d34365c08 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Tue, 21 Jun 2016 01:10:33 -0400 Subject: [PATCH 49/52] async: introduce 'async_domain' type This is in preparation for teaching async_synchronize_full() to sync all pending async work, and not just on the async_running domain. This conversion is functionally equivalent, just embedding the existing list in a new async_domain type. The .registered attribute is used in a later patch to distinguish between domains that want to be flushed by async_synchronize_full() versus those that only expect async_synchronize_{full|cookie}_domain to be used for flushing. 
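As a purely illustrative sketch (the "example_*" names are hypothetical and not part of this patch), a caller that wants a private, unregistered domain would declare it with ASYNC_DOMAIN_EXCLUSIVE() and flush it explicitly, which is exactly the conversion applied to the regulator and libsas call sites below:

    #include <linux/async.h>

    /* not flushed by async_synchronize_full(), since .registered == 0 */
    static ASYNC_DOMAIN_EXCLUSIVE(example_domain);

    static void example_async_work(void *data, async_cookie_t cookie)
    {
            /* slow, independent initialization work goes here */
    }

    static int example_init(void *data)
    {
            async_schedule_domain(example_async_work, data, &example_domain);

            /* wait only for work queued on this private domain */
            async_synchronize_full_domain(&example_domain);
            return 0;
    }

A domain declared with ASYNC_DOMAIN() instead keeps .registered = 1 and therefore participates in the global async_synchronize_full() flush, per the macro comments added below.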
[jejb: add async.h to scsi_priv.h for struct async_domain] Signed-off-by: Dan Williams Acked-by: Arjan van de Ven Acked-by: Mark Brown Tested-by: Eldad Zack Signed-off-by: James Bottomley Signed-off-by: Pranav Vashi (cherry picked from commit bb250a84af178d065bb930318a314008145ffdd3) --- drivers/regulator/core.c | 2 +- drivers/scsi/libsas/sas_ata.c | 2 +- drivers/scsi/scsi.c | 1 + include/linux/async.h | 35 +++++++++++++++++++++++++++++++---- kernel/async.c | 33 ++++++++++++++++----------------- sound/soc/soc-dapm.c | 2 +- 6 files changed, 51 insertions(+), 24 deletions(-) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index a4f1a503b..0303e0256 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2592,7 +2592,7 @@ static void regulator_bulk_enable_async(void *data, async_cookie_t cookie) int regulator_bulk_enable(int num_consumers, struct regulator_bulk_data *consumers) { - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); int i; int ret = 0; diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 51ee663c1..ec96c5fe3 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -720,7 +720,7 @@ static void async_sas_ata_eh(void *data, async_cookie_t cookie) void sas_ata_strategy_handler(struct Scsi_Host *shost) { struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); - LIST_HEAD(async); + ASYNC_DOMAIN_EXCLUSIVE(async); int i; /* it's ok to defer revalidation events during ata eh, these diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 29fabffd6..b1f76da1e 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/async.h b/include/linux/async.h index 68a953019..364e7ff16 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -9,19 +9,46 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ +#ifndef __ASYNC_H__ +#define __ASYNC_H__ #include #include typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); +struct async_domain { + struct list_head node; + struct list_head domain; + int count; + unsigned registered:1; +}; + +/* + * domain participates in global async_synchronize_full + */ +#define ASYNC_DOMAIN(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 1 } + +/* + * domain is free to go out of scope as soon as all pending work is + * complete, this domain does not participate in async_synchronize_full + */ +#define ASYNC_DOMAIN_EXCLUSIVE(_name) \ + struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ + .domain = LIST_HEAD_INIT(_name.domain), \ + .count = 0, \ + .registered = 0 } extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *list); + struct async_domain *domain); extern void async_synchronize_full(void); -extern void async_synchronize_full_domain(struct list_head *list); +extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *list); - + struct async_domain *domain); +#endif diff --git a/kernel/async.c b/kernel/async.c index 32d8dc960..b7717733f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -62,7 +62,7 @@ static async_cookie_t next_cookie = 1; #define MAX_WORK 32768 static LIST_HEAD(async_pending); -static LIST_HEAD(async_running); +static ASYNC_DOMAIN(async_running); static DEFINE_SPINLOCK(async_lock); struct async_entry { @@ -71,7 +71,7 @@ struct async_entry { async_cookie_t cookie; async_func_ptr *func; void *data; - struct list_head *running; + struct async_domain *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); @@ -82,7 +82,7 @@ static atomic_t entry_count; /* * MUST be called with the lock held! 
*/ -static async_cookie_t __lowest_in_progress(struct list_head *running) +static async_cookie_t __lowest_in_progress(struct async_domain *running) { struct async_entry *entry; @@ -93,9 +93,8 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; } - if (!list_empty(running)) { - entry = list_first_entry(running, - struct async_entry, list); + if (!list_empty(&running->domain)) { + entry = list_first_entry(&running->domain, typeof(*entry), list); return entry->cookie; } @@ -106,7 +105,7 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) return next_cookie; /* "infinity" value */ } -static async_cookie_t lowest_in_progress(struct list_head *running) +static async_cookie_t lowest_in_progress(struct async_domain *running) { unsigned long flags; async_cookie_t ret; @@ -126,10 +125,11 @@ static void async_run_entry_fn(struct work_struct *work) container_of(work, struct async_entry, work); unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; + struct async_domain *running = entry->running; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, entry->running); + list_move_tail(&entry->list, &running->domain); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ @@ -163,7 +163,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) +static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *running) { struct async_entry *entry; unsigned long flags; @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(async_schedule); * Note: This function may be called from atomic or non-atomic contexts. */ async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, - struct list_head *running) + struct async_domain *running) { return __async_schedule(ptr, data, running); } @@ -249,14 +249,14 @@ EXPORT_SYMBOL_GPL(async_synchronize_full); /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain - * @list: running list to synchronize on + * @domain: running list to synchronize on * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list have been done. + * synchronization domain specified by the running list @domain have been done. */ -void async_synchronize_full_domain(struct list_head *list) +void async_synchronize_full_domain(struct async_domain *domain) { - async_synchronize_cookie_domain(next_cookie, list); + async_synchronize_cookie_domain(next_cookie, domain); } EXPORT_SYMBOL_GPL(async_synchronize_full_domain); @@ -266,11 +266,10 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); * @running: running list to synchronize on, NULL indicates all lists * * This function waits until all asynchronous function calls for the - * synchronization domain specified by the running list @list submitted + * synchronization domain specified by running list @running submitted * prior to @cookie have been done. 
*/ -void async_synchronize_cookie_domain(async_cookie_t cookie, - struct list_head *running) +void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *running) { ktime_t uninitialized_var(starttime), delta, endtime; diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index d8a22c07e..5e612f881 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -1620,7 +1620,7 @@ static int dapm_power_widgets(struct snd_soc_dapm_context *dapm, int event) struct snd_soc_dapm_context *d; LIST_HEAD(up_list); LIST_HEAD(down_list); - LIST_HEAD(async_domain); + ASYNC_DOMAIN_EXCLUSIVE(async_domain); enum snd_soc_bias_level bias; trace_snd_soc_dapm_start(card); From 65a1793c9d10dbdc894b65fc001f0501f1eb99c7 Mon Sep 17 00:00:00 2001 From: Pranav Vashi Date: Tue, 18 Mar 2014 08:23:28 +0530 Subject: [PATCH 50/52] sched: Remove get_online_cpus() usage Testing/Modified for 3.4.y from 6acce3ef upstream. Remove get_online_cpus() usage from the scheduler; there's 4 sites that use it: - sched_init_smp(); where its completely superfluous since we're in 'early' boot and there simply cannot be any hotplugging. - sched_getaffinity(); we already take a raw spinlock to protect the task cpus_allowed mask, this disables preemption and therefore also stabilizes cpu_online_mask as that's modified using stop_machine. However switch to active mask for symmetry with sched_setaffinity()/set_cpus_allowed_ptr(). We guarantee active mask stability by inserting sync_rcu/sched() into _cpu_down. - sched_setaffinity(); we don't appear to need get_online_cpus() either, there's two sites where hotplug appears relevant: * cpuset_cpus_allowed(); for the !cpuset case we use possible_mask, for the cpuset case we hold task_lock, which is a spinlock and thus for mainline disables preemption (might cause pain on RT). * set_cpus_allowed_ptr(); Holds all scheduler locks and thus has preemption properly disabled; also it already deals with hotplug races explicitly where it releases them. - reinit_sched_domains(); we shouldn't need to hotplug here. In fact, this has likely been the thorn in my side, causing mpdecision to break on sched_mc_power_savings changes. Signed-off-by: Peter Zijlstra Cc: "Srivatsa S. Bhat" Cc: Paul McKenney Cc: Mel Gorman Cc: Rik van Riel Cc: Srikar Dronamraju Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Linus Torvalds Cc: Andrew Morton Cc: Steven Rostedt Cc: Oleg Nesterov Signed-off-by: Ingo Molnar (Modified for 3.4.y to include sched_domains()) Signed-off-by: Pranav Vashi Signed-off-by: franciscofranco (cherry picked from commit 4a9d093c7a9f394c589cd0380c09a0019cd13ffc) --- kernel/cpu.c | 17 +++++++++++++++++ kernel/sched/core.c | 16 +++++++--------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 19362f187..b2e02501b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -274,6 +274,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so explicitly call both. + */ +#ifdef CONFIG_PREEMPT + synchronize_sched(); +#endif + synchronize_rcu(); + + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. 
*/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 12b6466b3..751d37d87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4625,7 +4625,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); @@ -4678,7 +4677,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -4721,7 +4719,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -4734,12 +4731,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -7005,13 +7001,11 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) static void reinit_sched_domains(void) { - get_online_cpus(); /* Destroy domains first to force the rebuild */ partition_sched_domains(0, NULL, NULL); rebuild_sched_domains(); - put_online_cpus(); } static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) @@ -7162,14 +7156,18 @@ void __init sched_init_smp(void) alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); - get_online_cpus(); + + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); From b613992df31884003947769cccf6dcc5711ddb88 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Thu, 7 Jul 2016 13:21:09 +0800 Subject: [PATCH 51/52] swap: discard while swapping only if SWAP_FLAG_DISCARD_PAGES Considering the use cases where the swap device supports discard: a) and can do it quickly; b) but it's slow to do in small granularities (or concurrent with other I/O); c) but the implementation is so horrendous that you don't even want to send one down; And assuming that the sysadmin considers it useful to send the discards down at all, we would (probably) want the following solutions: i. do the fine-grained discards for freed swap pages, if device is capable of doing so optimally; ii. do single-time (batched) swap area discards, either at swapon or via something like fstrim (not implemented yet); iii. allow doing both single-time and fine-grained discards; or iv. turn it off completely (default behavior) As implemented today, one can only enable/disable discards for swap, but one cannot select, for instance, solution (ii) on a swap device like (b) even though the single-time discard is regarded to be interesting, or necessary to the workload because it would imply (1), and the device is not capable of performing it optimally. 
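As a purely illustrative userspace sketch (not part of this patch; the flag names and values are taken from the include/linux/swap.h hunk below, and the fallback defines are only there in case the system headers do not yet carry them), the new interface lets a sysadmin tool request, say, policy (ii) only, i.e. a single swapon-time area discard with no per-page-cluster discards:

    #include <sys/swap.h>

    /* values copied from the include/linux/swap.h hunk in this patch */
    #ifndef SWAP_FLAG_DISCARD
    #define SWAP_FLAG_DISCARD       0x10000
    #endif
    #ifndef SWAP_FLAG_DISCARD_ONCE
    #define SWAP_FLAG_DISCARD_ONCE  0x20000
    #endif

    int enable_area_discard_only(const char *swapdev)
    {
            /* policy (ii): one batched discard at swapon time only */
            return swapon(swapdev, SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE);
    }

Passing SWAP_FLAG_DISCARD alone keeps the older behavior of enabling both area and page-cluster discards (policy iii), while omitting it leaves discards off entirely (policy iv).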
This patch addresses the scenario depicted above by introducing a way to ensure the (probably) wanted solutions (i, ii, iii and iv) can be flexibly flagged through swapon(8) to allow a sysadmin to select the best suitable swap discard policy according to system constraints. This patch introduces the new SWAP_FLAG_DISCARD_PAGES and SWAP_FLAG_DISCARD_ONCE flags to allow more flexible swap discard policies to be flagged through swapon(8). The default behavior is to keep both single-time, or batched, area discards (SWAP_FLAG_DISCARD_ONCE) and fine-grained discards for page-clusters (SWAP_FLAG_DISCARD_PAGES) enabled, in order to keep consistency with older kernel behavior, as well as maintain compatibility with older swapon(8). However, through the newly introduced flags the best suitable discard policy can be selected according to any given swap device constraint. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Rafael Aquini Acked-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Shaohua Li Cc: Karel Zak Cc: Jeff Moyer Cc: Rik van Riel Cc: Larry Woodman Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 ++++-- mm/swapfile.c | 56 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 7 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 102021b8a..f27f30a1a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,10 +20,13 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 -#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ +#define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ +#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ - SWAP_FLAG_DISCARD) + SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ + SWAP_FLAG_DISCARD_PAGES) static inline int current_is_kswapd(void) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 3521cf159..2440b27a4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -232,7 +232,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { + if (si->flags & SWP_PAGE_DISCARD) { /* * Start range check on racing allocations, in case * they overlap the cluster we eventually decide on @@ -342,7 +342,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, if (si->lowest_alloc) { /* - * Only set when SWP_DISCARDABLE, and there's a scan + * Only set when SWP_PAGE_DISCARD, and there's a scan * for a free cluster in progress or just completed. */ if (found_free_cluster) { @@ -2103,6 +2103,21 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations.
+ */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2210,8 +2225,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (random32() % p->highest_bit); } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + printk(KERN_ERR + "swapon: discard_swap(%p): %d\n", + p, err); + } + } + if (blk_queue_fast(bdev_get_queue(p->bdev))) p->flags |= SWP_FAST; } @@ -2224,11 +2268,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); From e6fe955c1e91951831785b934290effba657dc53 Mon Sep 17 00:00:00 2001 From: "lance.zhou" Date: Thu, 7 Jul 2016 18:01:49 +0800 Subject: [PATCH 52/52] =?UTF-8?q?mm:=20add=20support=20for=20a=20filesyste?= =?UTF-8?q?m=20to=20activate=20swap=20files=20and=20use=20direc=E2=80=A6?= =?UTF-8?q?=20=E2=80=A6t=5FIO=20for=20writing=20swap=20pages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently swapfiles are managed entirely by the core VM by using ->bmap to allocate space and write to the blocks directly. This effectively ensures that the underlying blocks are allocated and avoids the need for the swap subsystem to locate what physical blocks store offsets within a file. If the swap subsystem is to use the filesystem information to locate the blocks, it is critical that information such as block groups, block bitmaps and the block descriptor table that map the swap file were resident in memory. This patch adds address_space_operations that the VM can call when activating or deactivating swap backed by a file. 
int swap_activate(struct file *); int swap_deactivate(struct file *); The ->swap_activate() method is used to communicate to the file that the VM relies on it, and the address_space should take adequate measures such as reserving space in the underlying device, reserving memory for mempools and pinning information such as the block descriptor table in memory. The ->swap_deactivate() method is called on sys_swapoff() if ->swap_activate() returned success. After a successful swapfile ->swap_activate, the swapfile is marked SWP_FILE and swapper_space.a_ops will proxy to sis->swap_file->f_mappings->a_ops using ->direct_io to write swapcache pages and ->readpage to read. It is perfectly possible that direct_IO be used to read the swap pages but it is an unnecessary complication. Similarly, it is possible that ->writepage be used instead of direct_io to write the pages but filesystem developers have stated that calling writepage from the VM is undesirable for a variety of reasons and using direct_IO opens up the possibility of writing back batches of swap pages in the future. [a.p.zijlstra@chello.nl: Original patch] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_io.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 2 +- mm/swapfile.c | 15 ++++++++++++-- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f27f30a1a..453425ace 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -337,6 +337,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern int swap_set_page_dirty(struct page *page); extern void end_swap_bio_write(struct bio *bio, int err); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)); diff --git a/mm/page_io.c b/mm/page_io.c index d0b4d4a55..c1d2f81db 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -145,17 +146,43 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, int swap_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); if (try_to_free_swap(page)) { unlock_page(page); goto out; } + if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); goto out; } + if (sis->flags & SWP_FILE) { + struct kiocb kiocb; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iovec iov = { + .iov_base = page_address(page), + .iov_len = PAGE_SIZE, + }; + + init_sync_kiocb(&kiocb, swap_file); + kiocb.ki_pos = page_file_offset(page); + kiocb.ki_left = PAGE_SIZE; + kiocb.ki_nbytes = PAGE_SIZE; + + unlock_page(page); + ret = mapping->a_ops->direct_IO(KERNEL_WRITE, + &kiocb, &iov, + kiocb.ki_pos, 1); + if (ret == PAGE_SIZE) { + count_vm_event(PSWPOUT); + ret = 0; + } + return ret; + } ret = __swap_writepage(page, wbc, end_swap_bio_write); out: return ret; @@ -188,6 +215,7 @@ int 
swap_readpage(struct page *page) { struct bio *bio; int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); @@ -196,6 +224,15 @@ int swap_readpage(struct page *page) unlock_page(page); goto out; } + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + ret = mapping->a_ops->readpage(swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); @@ -207,3 +244,15 @@ int swap_readpage(struct page *page) out: return ret; } + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct address_space *mapping = sis->swap_file->f_mapping; + return mapping->a_ops->set_page_dirty(page); + } else { + return __set_page_dirty_no_writeback(page); + } +} \ No newline at end of file diff --git a/mm/swap_state.c b/mm/swap_state.c index 7692d99b9..f1ee1e8fa 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -26,7 +26,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .set_page_dirty = swap_set_page_dirty, .migratepage = migrate_page, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 2440b27a4..f00f23601 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1512,7 +1512,9 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { - struct inode *inode; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; unsigned blkbits; @@ -1523,13 +1525,22 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) int nr_extents = 0; int ret; - inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; goto out; } + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(swap_file); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + } + goto out; + } + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits;