From 97f5a9b09274f653cd4f072b94e8b0aec61232eb Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 17 Jun 2013 13:32:11 +0300 Subject: [PATCH 1/2] fbdev: add FBIOCOPYAREA ioctl Based on the patch authored by Ali Gholami Rudi at https://lkml.org/lkml/2009/7/13/153 Provide an ioctl for userspace applications, but only if this operation is hardware accelerated (otherwide it does not make any sense). Signed-off-by: Siarhei Siamashka --- drivers/video/fbmem.c | 30 ++++++++++++++++++++++++++++++ include/linux/fb.h | 5 +++++ 2 files changed, 35 insertions(+) diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 0dff12a1daef26..78e6df757f6e0a 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1074,6 +1074,25 @@ fb_blank(struct fb_info *info, int blank) return ret; } +static int fb_copyarea_user(struct fb_info *info, + struct fb_copyarea *copy) +{ + int ret = 0; + if (!lock_fb_info(info)) + return -ENODEV; + if (copy->dx + copy->width > info->var.xres || + copy->sx + copy->width > info->var.xres || + copy->dy + copy->height > info->var.yres || + copy->sy + copy->height > info->var.yres) { + ret = -EINVAL; + goto out; + } + info->fbops->fb_copyarea(info, copy); +out: + unlock_fb_info(info); + return ret; +} + static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, unsigned long arg) { @@ -1084,6 +1103,7 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, struct fb_cmap cmap_from; struct fb_cmap_user cmap; struct fb_event event; + struct fb_copyarea copy; void __user *argp = (void __user *)arg; long ret = 0; @@ -1191,6 +1211,15 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, console_unlock(); unlock_fb_info(info); break; + case FBIOCOPYAREA: + if (info->flags & FBINFO_HWACCEL_COPYAREA) { + /* only provide this ioctl if it is accelerated */ + if (copy_from_user(©, argp, sizeof(copy))) + return -EFAULT; + ret = fb_copyarea_user(info, ©); + break; + } + /* fall through */ default: if (!lock_fb_info(info)) return -ENODEV; @@ -1343,6 +1372,7 @@ static long fb_compat_ioctl(struct file *file, unsigned int cmd, case FBIOPAN_DISPLAY: case FBIOGET_CON2FBMAP: case FBIOPUT_CON2FBMAP: + case FBIOCOPYAREA: arg = (unsigned long) compat_ptr(arg); case FBIOBLANK: ret = do_fb_ioctl(info, cmd, arg); diff --git a/include/linux/fb.h b/include/linux/fb.h index ac3f1c60584320..fab79a2b0ca4e0 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -39,6 +39,11 @@ #define FBIOPUT_MODEINFO 0x4617 #define FBIOGET_DISPINFO 0x4618 #define FBIO_WAITFORVSYNC _IOW('F', 0x20, __u32) +/* + * HACK: use 'z' in order not to clash with any other ioctl numbers which might + * be concurrently added to the mainline kernel + */ +#define FBIOCOPYAREA _IOW('z', 0x21, struct fb_copyarea) #define FB_TYPE_PACKED_PIXELS 0 /* Packed Pixels */ #define FB_TYPE_PLANES 1 /* Non interleaved planes */ From d3e018b11ce6d20b6449a81c6674f859bdf1c58c Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 17 Jun 2013 16:00:25 +0300 Subject: [PATCH 2/2] bcm2708_fb: DMA acceleration for fb_copyarea Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425 Also used Simon's dmaer_master module as a reference for tweaking DMA settings for better performance. For now busylooping only. IRQ support might be added later. With non-overclocked Raspberry Pi, the performance is ~360 MB/s for simple copy or ~260 MB/s for two-pass copy (used when dragging windows to the right). In the case of using DMA channel 0, the performance could improve to ~440 MB/s. But channel 0 is used by SDHCI at the moment. For comparison, VFP optimized CPU copy can only do ~114 MB/s in the same conditions (hindered by reading uncached source buffer). Signed-off-by: Siarhei Siamashka --- drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) diff --git a/drivers/video/bcm2708_fb.c b/drivers/video/bcm2708_fb.c index efdee9d96b1c4c..56a5656bf89a6a 100644 --- a/drivers/video/bcm2708_fb.c +++ b/drivers/video/bcm2708_fb.c @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -64,6 +65,11 @@ struct bcm2708_fb { struct fbinfo_s *info; dma_addr_t dma; u32 cmap[16]; + int dma_chan; + int dma_irq; + void __iomem *dma_chan_base; + void *cb_base; /* DMA control blocks */ + dma_addr_t cb_handle; }; #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb) @@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct fb_info *info, cfb_fillrect(info, rect); } +/* A helper function for configuring dma control block */ +static void set_dma_cb(struct bcm2708_dma_cb *cb, + int burst_size, + dma_addr_t dst, + int dst_stride, + dma_addr_t src, + int src_stride, + int w, + int h) +{ + cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH | + BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH | + BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE; + cb->dst = dst; + cb->src = src; + /* + * This is not really obvious from the DMA documentation, + * but the top 16 bits must be programmmed to "height -1" + * and not "height" in 2D mode. + */ + cb->length = ((h - 1) << 16) | w; + cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w); + cb->pad[0] = 0; + cb->pad[1] = 0; +} + static void bcm2708_fb_copyarea(struct fb_info *info, const struct fb_copyarea *region) { - /*print_debug("bcm2708_fb_copyarea\n"); */ - cfb_copyarea(info, region); + struct bcm2708_fb *fb = to_bcm2708(info); + struct bcm2708_dma_cb *cb = fb->cb_base; + int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3; + /* Channel 0 supports larger bursts and is a bit faster */ + int burst_size = (fb->dma_chan == 0) ? 10 : 4; + + /* Fallback to cfb_copyarea() if we don't like something */ + if (bytes_per_pixel > 4 || + info->var.xres > 1920 || info->var.yres > 1200 || + region->width <= 0 || region->width > info->var.xres || + region->height <= 0 || region->height > info->var.yres || + region->sx < 0 || region->sx >= info->var.xres || + region->sy < 0 || region->sy >= info->var.yres || + region->dx < 0 || region->dx >= info->var.xres || + region->dy < 0 || region->dy >= info->var.yres || + region->sx + region->width > info->var.xres || + region->dx + region->width > info->var.xres || + region->sy + region->height > info->var.yres || + region->dy + region->height > info->var.yres) { + cfb_copyarea(info, region); + return; + } + + if (region->dy == region->sy && region->dx > region->sx) { + /* + * A difficult case of overlapped copy. Because DMA can't + * copy individual scanlines in backwards direction, we need + * two-pass processing. We do it by programming a chain of dma + * control blocks in the first 16K part of the buffer and use + * the remaining 48K as the intermediate temporary scratch + * buffer. The buffer size is sufficient to handle up to + * 1920x1200 resolution at 32bpp pixel depth. + */ + int y; + dma_addr_t control_block_pa = fb->cb_handle; + dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024; + int scanline_size = bytes_per_pixel * region->width; + int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size; + + for (y = 0; y < region->height; y += scanlines_per_cb) { + dma_addr_t src = + fb->fb.fix.smem_start + + bytes_per_pixel * region->sx + + (region->sy + y) * fb->fb.fix.line_length; + dma_addr_t dst = + fb->fb.fix.smem_start + + bytes_per_pixel * region->dx + + (region->dy + y) * fb->fb.fix.line_length; + + if (region->height - y < scanlines_per_cb) + scanlines_per_cb = region->height - y; + + set_dma_cb(cb, burst_size, scratchbuf, scanline_size, + src, fb->fb.fix.line_length, + scanline_size, scanlines_per_cb); + control_block_pa += sizeof(struct bcm2708_dma_cb); + cb->next = control_block_pa; + cb++; + + set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length, + scratchbuf, scanline_size, + scanline_size, scanlines_per_cb); + control_block_pa += sizeof(struct bcm2708_dma_cb); + cb->next = control_block_pa; + cb++; + } + /* move the pointer back to the last dma control block */ + cb--; + } else { + /* A single dma control block is enough. */ + int sy, dy, stride; + if (region->dy <= region->sy) { + /* processing from top to bottom */ + dy = region->dy; + sy = region->sy; + stride = fb->fb.fix.line_length; + } else { + /* processing from bottom to top */ + dy = region->dy + region->height - 1; + sy = region->sy + region->height - 1; + stride = -fb->fb.fix.line_length; + } + set_dma_cb(cb, burst_size, + fb->fb.fix.smem_start + dy * fb->fb.fix.line_length + + bytes_per_pixel * region->dx, + stride, + fb->fb.fix.smem_start + sy * fb->fb.fix.line_length + + bytes_per_pixel * region->sx, + stride, + region->width * bytes_per_pixel, + region->height); + } + + /* end of dma control blocks chain */ + cb->next = 0; + + bcm_dma_start(fb->dma_chan_base, fb->cb_handle); + bcm_dma_wait_idle(fb->dma_chan_base); } static void bcm2708_fb_imageblit(struct fb_info *info, @@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bcm2708_fb *fb) fb->dma = dma; } fb->fb.fbops = &bcm2708_fb_ops; - fb->fb.flags = FBINFO_FLAG_DEFAULT; + fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA; fb->fb.pseudo_palette = fb->cmap; strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id)); @@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platform_device *dev) } memset(fb, 0, sizeof(struct bcm2708_fb)); + fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K, + &fb->cb_handle, GFP_KERNEL); + if (!fb->cb_base) { + dev_err(&dev->dev, "cannot allocate DMA CBs\n"); + ret = -ENOMEM; + goto free_fb; + } + + pr_info("BCM2708FB: allocated DMA memory %08x\n", + fb->cb_handle); + + ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_FAST, + &fb->dma_chan_base, &fb->dma_irq); + if (ret < 0) { + dev_err(&dev->dev, "couldn't allocate a DMA channel\n"); + goto free_cb; + } + fb->dma_chan = ret; + + pr_info("BCM2708FB: allocated DMA channel %d @ %p\n", + fb->dma_chan, fb->dma_chan_base); + fb->dev = dev; ret = bcm2708_fb_register(fb); @@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platform_device *dev) goto out; } +free_cb: + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); +free_fb: kfree(fb); free_region: dev_err(&dev->dev, "probe failed, err %d\n", ret); @@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct platform_device *dev) iounmap(fb->fb.screen_base); unregister_framebuffer(&fb->fb); + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); + bcm_dma_chan_free(fb->dma_chan); + dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info, fb->dma); kfree(fb);