From: jassi brar on
On Thu, Mar 25, 2010 at 12:17 PM, Joonyoung Shim
<jy0922.shim(a)samsung.com> wrote:
> The PL330 is currently the DMA controller used on the S5PC1XX ARM SoC.
> This supports DMA_MEMCPY and DMA_SLAVE.
>
> The datasheet for the PL330 can be found at the URL below:
> http://infocenter.arm.com/help/topic/com.arm.doc.ddi0424a/DDI0424A_dmac_pl330_r0p0_trm.pdf
>
> Signed-off-by: Joonyoung Shim <jy0922.shim(a)samsung.com>
> ---
> Change log:
>
> v2: Convert into an amba_device driver.
> Code clean and update from v1 patch review.

Here goes some quick technical feedback of the code.
Please remember that these issues are only secondary.
The primary drawback is the approach that this patch adopts,
as already explained in other posts.

[snip]

> +/* instruction set functions */
> +static inline int pl330_dmaaddh(u8 *desc_pool_virt, u16 imm, bool ra)
> +{
> + u8 opcode = DMAADDH | (ra << 1);
> +
> + writeb(opcode, desc_pool_virt++);
> + writew(imm, desc_pool_virt);
> + return 3;
> +}
> +
> +static inline int pl330_dmaend(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMAEND;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmaflushp(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMAFLUSHHP;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmald(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMALD;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmalds(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMALDS;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmaldb(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMALDB;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmaldps(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMALDPS;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmaldpb(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMALDPB;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmalp(u8 *desc_pool_virt, u8 iter, bool lc)
> +{
> + u8 opcode = DMALP | (lc << 1);
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(iter, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmalpend(u8 *desc_pool_virt, u8 backwards_jump, bool lc)
> +{
> + u8 opcode = DMALPEND | (lc << 2);
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(backwards_jump, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmalpends(u8 *desc_pool_virt, u8 backwards_jump,
> + bool lc)
> +{
> + u8 opcode = DMALPENDS | (lc << 2);
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(backwards_jump, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmalpendb(u8 *desc_pool_virt, u8 backwards_jump,
> + bool lc)
> +{
> + u8 opcode = DMALPENDB | (lc << 2);
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(backwards_jump, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmalpfe(u8 *desc_pool_virt, u8 backwards_jump, bool lc)
> +{
> + u8 opcode = DMALPFE | (lc << 2);
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(backwards_jump, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmakill(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMAKILL;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmamov(u8 *desc_pool_virt, u8 rd, u32 imm)
> +{
> + u8 opcode = DMAMOV;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(rd, desc_pool_virt++);
> + writel(imm, desc_pool_virt);
> + return 6;
> +}
> +
> +static inline int pl330_dmanop(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMANOP;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmarmb(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMARMB;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmasev(u8 *desc_pool_virt, u8 event_num)
> +{
> + u8 opcode = DMASEV;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(event_num << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmast(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMAST;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmasts(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMASTS;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmastb(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMASTB;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmastps(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMASTPS;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmastpb(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMASTPB;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmastz(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMASTZ;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static inline int pl330_dmawfe(u8 *desc_pool_virt, u8 event_num, bool invalid)
> +{
> + u8 opcode = DMAWFE;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb((event_num << 3) | (invalid << 1), desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmawfps(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMAWFPS;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmawfpb(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMAWFPB;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmawfpp(u8 *desc_pool_virt, u8 periph)
> +{
> + u8 opcode = DMAWFPP;
> +
> + writeb(opcode, desc_pool_virt++);
> + writeb(periph << 3, desc_pool_virt);
> + return 2;
> +}
> +
> +static inline int pl330_dmawmb(u8 *desc_pool_virt)
> +{
> + u8 opcode = DMAWMB;
> +
> + writeb(opcode, desc_pool_virt);
> + return 1;
> +}
> +
> +static void pl330_dmago(struct pl330_chan *pl330_ch, struct pl330_desc *desc,
> + bool ns)
> +{
> + unsigned int val;
> + u8 opcode = DMAGO | (ns << 1);
> +
> + val = (pl330_ch->id << 24) | (opcode << 16) | (pl330_ch->id << 8);
> + writel(val, pl330_ch->pl330_dev->reg_base + PL330_DBGINST0);
> +
> + val = desc->async_tx.phys;
> + writel(val, pl330_ch->pl330_dev->reg_base + PL330_DBGINST1);
> +
> + writel(0, pl330_ch->pl330_dev->reg_base + PL330_DBGCMD);
> +}
As already mentioned by Marc, it doesn't have to be read/write.
PL330 specifies the microcode buffers to be on system memory and that
need not be treated like ioports.

[snip]

> +static struct pl330_desc *
> +pl330_alloc_descriptor(struct pl330_chan *pl330_ch, gfp_t flags)
> +{
> + struct device *dev = pl330_ch->pl330_dev->common.dev;
> + struct pl330_desc *desc;
> + dma_addr_t phys;
> +
> + desc = kzalloc(sizeof(*desc), flags);
> + if (!desc)
> + return NULL;
> +
> + desc->desc_pool_virt = dma_alloc_coherent(dev, PL330_POOL_SIZE, &phys,
> + flags);
These allocations are inefficient and don't need to be done so often.
My implementation allocates a pool of such buffers(size specified by
DMA API driver)
and manage them by simple pointer manipulation.
Though the xfer requests for DMA API has to be managed in the DMA API driver.

> + if (!desc->desc_pool_virt) {
> + kfree(desc);
> + return NULL;
> + }
> +
> + dma_async_tx_descriptor_init(&desc->async_tx, &pl330_ch->common);
> + desc->async_tx.tx_submit = pl330_tx_submit;
> + desc->async_tx.phys = phys;
> +
> + return desc;
> +}
> +
> +static struct pl330_desc *pl330_get_descriptor(struct pl330_chan *pl330_ch)
> +{
> + struct device *dev = pl330_ch->pl330_dev->common.dev;
> + struct pl330_desc *desc;
> +
> + if (!list_empty(&pl330_ch->free_desc)) {
> + desc = to_pl330_desc(pl330_ch->free_desc.next);
> + list_del(&desc->desc_node);
> + } else {
> + /* try to get another desc */
> + desc = pl330_alloc_descriptor(pl330_ch, GFP_ATOMIC);
> + if (!desc) {
> + dev_err(dev, "descriptor alloc failed\n");
> + return NULL;
> + }
> + }
> +
> + return desc;
> +}
> +
> +static int pl330_alloc_chan_resources(struct dma_chan *chan)
> +{
> + struct pl330_chan *pl330_ch = to_pl330_chan(chan);
> + struct device *dev = pl330_ch->pl330_dev->common.dev;
> + struct pl330_desc *desc;
> + int i;
> + LIST_HEAD(tmp_list);
> +
> + /* have we already been set up? */
> + if (!list_empty(&pl330_ch->free_desc))
> + return pl330_ch->desc_num;
> +
> + for (i = 0; i < PL330_DESC_NUM; i++) {
> + desc = pl330_alloc_descriptor(pl330_ch, GFP_KERNEL);
> + if (!desc) {
> + dev_err(dev, "Only %d initial descriptors\n", i);
> + break;
> + }
> + list_add_tail(&desc->desc_node, &tmp_list);
> + }
> +
> + pl330_ch->completed = chan->cookie = 1;
> + pl330_ch->desc_num = i;
> + list_splice(&tmp_list, &pl330_ch->free_desc);
> +
> + return pl330_ch->desc_num;
> +}
> +

[snip]

> +static unsigned int pl330_make_instructions(struct pl330_chan *pl330_ch,
> + struct pl330_desc *desc, dma_addr_t dest, dma_addr_t src,
> + size_t len, unsigned int inst_size,
> + enum dma_data_direction direction)
> +{
> + struct device *dev = pl330_ch->pl330_dev->common.dev;
> + void *buf = desc->desc_pool_virt;
> + u32 control = *(u32 *)&pl330_ch->pl330_reg_cc;
> + unsigned int loop_size;
> + unsigned int loop_size_rest;
> + unsigned int loop_count0;
> + unsigned int loop_count1 = 0;
> + unsigned int loop_count0_rest = 0;
> + unsigned int loop_start0 = 0;
> + unsigned int loop_start1 = 0;
> +
> + dev_dbg(dev, "desc_pool_phys: 0x%x\n", desc->async_tx.phys);
> + dev_dbg(dev, "control: 0x%x\n", control);
> + dev_dbg(dev, "dest: 0x%x\n", dest);
> + dev_dbg(dev, "src: 0x%x\n", src);
> + dev_dbg(dev, "len: 0x%x\n", len);
> +
> + /* calculate loop count */
> + loop_size = (pl330_ch->pl330_reg_cc.src_burst_len + 1) *
> + (1 << pl330_ch->pl330_reg_cc.src_burst_size);
> + loop_count0 = (len / loop_size) - 1;
> + loop_size_rest = len % loop_size;
> +
> + dev_dbg(dev, "loop_size: 0x%x\n", loop_size);
> + dev_dbg(dev, "loop_count0: 0x%x\n", loop_count0);
> + dev_dbg(dev, "loop_size_rest: 0x%x\n", loop_size_rest);
> +
> + if (loop_size_rest) {
> + dev_err(dev, "Transfer length must be aligned to loop_size\n");
> + return -EINVAL;
> + }
This limit, though not serious, is unconditionally imposed by your design.
There are ways to get around this situation by smarter generation of
microcode.

> + if (loop_count0 >= PL330_MAX_LOOPS) {
> + loop_count1 = (loop_count0 / PL330_MAX_LOOPS) - 1;
> + loop_count0_rest = (loop_count0 % PL330_MAX_LOOPS) + 1;
> + loop_count0 = PL330_MAX_LOOPS - 1;
> + dev_dbg(dev, "loop_count0: 0x%x\n", loop_count0);
> + dev_dbg(dev, "loop_count0_rest: 0x%x\n", loop_count0_rest);
> + dev_dbg(dev, "loop_count1: 0x%x\n", loop_count1);
> +
> + if (loop_count1 >= PL330_MAX_LOOPS)
> + dev_dbg(dev, "loop_count1 overflow\n");
Again, the DMA API drivers will suffer just because someone didn't care
to generate microcode efficiently.
The microcode template for xfer takes only about 50 bytes and despite
having PL330_POOL_SIZE buffer, you have to drop xfer requests just because
the template is not properly designed.
My implementation is limited only by the microcode buffer size, which in turn
can be specified at startup by the DMA API driver.

> + }
> +
> + /* write instruction sets on buffer */
> + inst_size += pl330_dmamov(buf + inst_size, RD_DAR, dest);
> + inst_size += pl330_dmamov(buf + inst_size, RD_SAR, src);
> + inst_size += pl330_dmamov(buf + inst_size, RD_CCR, control);
> +
> + if (loop_count1) {
> + inst_size += pl330_dmalp(buf + inst_size, loop_count1, LC_1);
> + loop_start1 = inst_size;
> + }
> +
> + if (loop_count0) {
> + inst_size += pl330_dmalp(buf + inst_size, loop_count0, LC_0);
> + loop_start0 = inst_size;
> + }
> +
> + if (direction == DMA_TO_DEVICE) {
> + struct pl330_dma_slave *dma_slave = pl330_ch->common.private;
> + u8 periph = dma_slave->peri_num;
> + inst_size += pl330_dmawfps(buf + inst_size, periph);
> + inst_size += pl330_dmald(buf + inst_size);
> + inst_size += pl330_dmastps(buf + inst_size, periph);
> + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> + } else if (direction == DMA_FROM_DEVICE) {
> + struct pl330_dma_slave *dma_slave = pl330_ch->common.private;
> + u8 periph = dma_slave->peri_num;
> + inst_size += pl330_dmawfps(buf + inst_size, periph);
> + inst_size += pl330_dmaldps(buf + inst_size, periph);
> + inst_size += pl330_dmast(buf + inst_size);
> + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> + } else {
> + inst_size += pl330_dmald(buf + inst_size);
> + inst_size += pl330_dmarmb(buf + inst_size);
> + inst_size += pl330_dmast(buf + inst_size);
> + inst_size += pl330_dmawmb(buf + inst_size);
> + }
> +
> + if (loop_count0)
> + inst_size += pl330_dmalpend(buf + inst_size,
> + inst_size - loop_start0, LC_0);
> +
> + if (loop_count1)
> + inst_size += pl330_dmalpend(buf + inst_size,
> + inst_size - loop_start1, LC_1);
> +
> + if (loop_count0_rest) {
> + inst_size += pl330_dmalp(buf + inst_size, loop_count0_rest - 1,
> + LC_0);
> + loop_start0 = inst_size;
> +
> + if (direction == DMA_TO_DEVICE) {
> + struct pl330_dma_slave *dma_slave =
> + pl330_ch->common.private;
> + u8 periph = dma_slave->peri_num;
> + inst_size += pl330_dmawfps(buf + inst_size, periph);
> + inst_size += pl330_dmald(buf + inst_size);
> + inst_size += pl330_dmastps(buf + inst_size, periph);
> + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> + } else if (direction == DMA_FROM_DEVICE) {
> + struct pl330_dma_slave *dma_slave =
> + pl330_ch->common.private;
> + u8 periph = dma_slave->peri_num;
> + inst_size += pl330_dmawfps(buf + inst_size, periph);
> + inst_size += pl330_dmaldps(buf + inst_size, periph);
> + inst_size += pl330_dmast(buf + inst_size);
> + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> + } else {
> + inst_size += pl330_dmald(buf + inst_size);
> + inst_size += pl330_dmarmb(buf + inst_size);
> + inst_size += pl330_dmast(buf + inst_size);
> + inst_size += pl330_dmawmb(buf + inst_size);
> + }
> +
> + inst_size += pl330_dmalpend(buf + inst_size,
> + inst_size - loop_start0, LC_0);
> + }
> +
> + inst_size += pl330_dmasev(buf + inst_size, pl330_ch->id);
> + inst_size += pl330_dmaend(buf + inst_size);
> +
> + return inst_size;
> +}
This instruction generation leaves no scope for Security permissions for xfers,
that is a feature of PL330.

[snip]

> +static void pl330_xfer_complete(struct pl330_chan *pl330_ch)
> +{
> + struct pl330_desc *desc;
> + dma_async_tx_callback callback;
> + void *callback_param;
> +
> + /* execute next desc */
> + pl330_issue_pending(&pl330_ch->common);
> +
> + if (list_empty(&pl330_ch->complete_desc))
> + return;
> +
> + desc = to_pl330_desc(pl330_ch->complete_desc.next);
> + list_move_tail(&desc->desc_node, &pl330_ch->free_desc);
> +
> + pl330_ch->completed = desc->async_tx.cookie;
> +
> + callback = desc->async_tx.callback;
> + callback_param = desc->async_tx.callback_param;
> + if (callback)
> + callback(callback_param);
> +}
> +
> +static void pl330_ch_tasklet(unsigned long data)
> +{
> + struct pl330_chan *pl330_ch = (struct pl330_chan *)data;
> + unsigned int val;
> +
> + pl330_xfer_complete(pl330_ch);
> +
> + /* enable channel interrupt */
> + val = readl(pl330_ch->pl330_dev->reg_base + PL330_INTEN);
> + val |= (1 << pl330_ch->id);
> + writel(val, pl330_ch->pl330_dev->reg_base + PL330_INTEN);
> +}
> +
> +static irqreturn_t pl330_irq_handler(int irq, void *data)
> +{
> + struct pl330_device *pl330_dev = data;
> + struct pl330_chan *pl330_ch;
> + unsigned int intstatus;
> + unsigned int inten;
> + int i;
> +
> + intstatus = readl(pl330_dev->reg_base + PL330_INTSTATUS);
> +
> + if (intstatus == 0)
> + return IRQ_HANDLED;
> +
> + inten = readl(pl330_dev->reg_base + PL330_INTEN);
> + for (i = 0; i < PL330_MAX_CHANS; i++) {
> + if (intstatus & (1 << i)) {
> + pl330_ch = &pl330_dev->pl330_ch[i];
> + writel(1 << i, pl330_dev->reg_base + PL330_INTCLR);
> +
> + /* disable channel interrupt */
> + inten &= ~(1 << i);
> + writel(inten, pl330_dev->reg_base + PL330_INTEN);
> +
> + tasklet_schedule(&pl330_ch->tasklet);
I think the DMA API already prohibits doing non-irq-context things(like enqueue)
in the callbacks, so why implement tasklets here?
This may still get you "audio working fine" with Samsung I2S controller,
but is likely to cause problems with more demanding peripherals like SPDIF
if they operate at best QOS(even 24bits/sample Stereo at 96000Hz) and has
shallow FIFO(8 samples deep and hence 84 usecs acceptable latency).
Remember that SPDIF usually goes with other system load like HDMI HD
playback, which only increases the interrupt latency.

Not to forget, the overall throughput hit taken by other dma clients,
like MMC over SPI that use 256/512 bytes DMA xfers, due to delayed
DMA-DONE notifications.

Also, using tasklet here may break any protocol that involves _time-bound_ ACK
via some register after the xfer has been done.

If some client needs to do sleepable-context stuff after DMA-Xfer-Done,
let that driver implement tasklet in it's callback rather than have every
client pay the price.

> + }
> + }
> +
> + return IRQ_HANDLED;
> +}
> +

[snip]

> +
> +static int __devinit pl330_probe(struct amba_device *adev, struct amba_id *id)
> +{
> + struct pl330_device *pl330_dev;

[snip]

> +
> + for (i = 0; i < PL330_MAX_CHANS; i++) {
This whole code is designed around the assumption of every DMAC having
PL330_MAX_CHANS channels. That is dangerous, since PL330 is highly
configurable and some implementation may choose to implement less than
PL330_MAX_CHANS(i.e 8) channels.
As the PL330 spec says, most operations for non-existing channel result in
DMA Abort. Further, the IRQ handler assumes utopia and doesn't even
care to check
such conditions, as a result on non-s3c like implementations there are many
chances the system will just hang looping in DMA Abort irq or no irq at all
depending upon the cause.
Not to mention the unnecessary allocation for MAX possible resources, though
not very serious.

regards
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Ben Dooks on
On Fri, Mar 26, 2010 at 11:08:06AM +0900, jassi brar wrote:
> On Thu, Mar 25, 2010 at 12:17 PM, Joonyoung Shim
> <jy0922.shim(a)samsung.com> wrote:
> > The PL330 is currently the DMA controller used on the S5PC1XX ARM SoC.
> > This supports DMA_MEMCPY and DMA_SLAVE.
> >
> > The datasheet for the PL330 can be found at the URL below:
> > http://infocenter.arm.com/help/topic/com.arm.doc.ddi0424a/DDI0424A_dmac_pl330_r0p0_trm.pdf
> >
> > Signed-off-by: Joonyoung Shim <jy0922.shim(a)samsung.com>
> > ---
> > Change log:
> >
> > v2: Convert into an amba_device driver.
> > Code clean and update from v1 patch review.
>
> Here goes some quick technical feedback of the code.
> Please remember that these issues are only secondary.
> The primary drawback is the approach that this patch adopts,
> as already explained in other posts.
>
> [snip]
>
> > +/* instruction set functions */
> > +static inline int pl330_dmaaddh(u8 *desc_pool_virt, u16 imm, bool ra)
> > +{
> > + u8 opcode = DMAADDH | (ra << 1);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writew(imm, desc_pool_virt);
> > + return 3;
> > +}
> > +
> > +static inline int pl330_dmaend(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMAEND;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmaflushp(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMAFLUSHHP;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmald(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMALD;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmalds(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMALDS;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmaldb(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMALDB;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmaldps(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMALDPS;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmaldpb(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMALDPB;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmalp(u8 *desc_pool_virt, u8 iter, bool lc)
> > +{
> > + u8 opcode = DMALP | (lc << 1);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(iter, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmalpend(u8 *desc_pool_virt, u8 backwards_jump, bool lc)
> > +{
> > + u8 opcode = DMALPEND | (lc << 2);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(backwards_jump, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmalpends(u8 *desc_pool_virt, u8 backwards_jump,
> > + bool lc)
> > +{
> > + u8 opcode = DMALPENDS | (lc << 2);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(backwards_jump, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmalpendb(u8 *desc_pool_virt, u8 backwards_jump,
> > + bool lc)
> > +{
> > + u8 opcode = DMALPENDB | (lc << 2);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(backwards_jump, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmalpfe(u8 *desc_pool_virt, u8 backwards_jump, bool lc)
> > +{
> > + u8 opcode = DMALPFE | (lc << 2);
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(backwards_jump, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmakill(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMAKILL;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmamov(u8 *desc_pool_virt, u8 rd, u32 imm)
> > +{
> > + u8 opcode = DMAMOV;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(rd, desc_pool_virt++);
> > + writel(imm, desc_pool_virt);
> > + return 6;
> > +}
> > +
> > +static inline int pl330_dmanop(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMANOP;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmarmb(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMARMB;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmasev(u8 *desc_pool_virt, u8 event_num)
> > +{
> > + u8 opcode = DMASEV;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(event_num << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmast(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMAST;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmasts(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMASTS;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmastb(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMASTB;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmastps(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMASTPS;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmastpb(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMASTPB;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmastz(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMASTZ;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static inline int pl330_dmawfe(u8 *desc_pool_virt, u8 event_num, bool invalid)
> > +{
> > + u8 opcode = DMAWFE;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb((event_num << 3) | (invalid << 1), desc_pool_virt);
> > + return 2;

writeb() is for peripherals. You can do 'desc_pool_virt[0] = ' here.

> > +
> > +static inline int pl330_dmawfps(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMAWFPS;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmawfpb(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMAWFPB;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmawfpp(u8 *desc_pool_virt, u8 periph)
> > +{
> > + u8 opcode = DMAWFPP;
> > +
> > + writeb(opcode, desc_pool_virt++);
> > + writeb(periph << 3, desc_pool_virt);
> > + return 2;
> > +}
> > +
> > +static inline int pl330_dmawmb(u8 *desc_pool_virt)
> > +{
> > + u8 opcode = DMAWMB;
> > +
> > + writeb(opcode, desc_pool_virt);
> > + return 1;
> > +}
> > +
> > +static void pl330_dmago(struct pl330_chan *pl330_ch, struct pl330_desc *desc,
> > + bool ns)
> > +{
> > + unsigned int val;
> > + u8 opcode = DMAGO | (ns << 1);
> > +
> > + val = (pl330_ch->id << 24) | (opcode << 16) | (pl330_ch->id << 8);
> > + writel(val, pl330_ch->pl330_dev->reg_base + PL330_DBGINST0);
> > +
> > + val = desc->async_tx.phys;
> > + writel(val, pl330_ch->pl330_dev->reg_base + PL330_DBGINST1);
> > +
> > + writel(0, pl330_ch->pl330_dev->reg_base + PL330_DBGCMD);
> > +}
> As already mentioned by Marc, it doesn't have to be read/write.
> PL330 specifies the microcode buffers to be on system memory and that
> need not be treated like ioports.
>
> [snip]
>
> > +static struct pl330_desc *
> > +pl330_alloc_descriptor(struct pl330_chan *pl330_ch, gfp_t flags)
> > +{
> > + struct device *dev = pl330_ch->pl330_dev->common.dev;
> > + struct pl330_desc *desc;
> > + dma_addr_t phys;
> > +
> > + desc = kzalloc(sizeof(*desc), flags);
> > + if (!desc)
> > + return NULL;
> > +
> > + desc->desc_pool_virt = dma_alloc_coherent(dev, PL330_POOL_SIZE, &phys,
> > + flags);
> These allocations are inefficient and don't need to be done so often.
> My implementation allocates a pool of such buffers(size specified by
> DMA API driver)
> and manage them by simple pointer manipulation.
> Though the xfer requests for DMA API has to be managed in the DMA API driver.

There's a dma pool implementation too in the kernel.

>
> > + if (!desc->desc_pool_virt) {
> > + kfree(desc);
> > + return NULL;
> > + }
> > +
> > + dma_async_tx_descriptor_init(&desc->async_tx, &pl330_ch->common);
> > + desc->async_tx.tx_submit = pl330_tx_submit;
> > + desc->async_tx.phys = phys;
> > +
> > + return desc;
> > +}
> > +
> > +static struct pl330_desc *pl330_get_descriptor(struct pl330_chan *pl330_ch)
> > +{
> > + struct device *dev = pl330_ch->pl330_dev->common.dev;
> > + struct pl330_desc *desc;
> > +
> > + if (!list_empty(&pl330_ch->free_desc)) {
> > + desc = to_pl330_desc(pl330_ch->free_desc.next);
> > + list_del(&desc->desc_node);
> > + } else {
> > + /* try to get another desc */
> > + desc = pl330_alloc_descriptor(pl330_ch, GFP_ATOMIC);
> > + if (!desc) {
> > + dev_err(dev, "descriptor alloc failed\n");
> > + return NULL;
> > + }
> > + }
> > +
> > + return desc;
> > +}
> > +
> > +static int pl330_alloc_chan_resources(struct dma_chan *chan)
> > +{
> > + struct pl330_chan *pl330_ch = to_pl330_chan(chan);
> > + struct device *dev = pl330_ch->pl330_dev->common.dev;
> > + struct pl330_desc *desc;
> > + int i;
> > + LIST_HEAD(tmp_list);
> > +
> > + /* have we already been set up? */
> > + if (!list_empty(&pl330_ch->free_desc))
> > + return pl330_ch->desc_num;
> > +
> > + for (i = 0; i < PL330_DESC_NUM; i++) {
> > + desc = pl330_alloc_descriptor(pl330_ch, GFP_KERNEL);
> > + if (!desc) {
> > + dev_err(dev, "Only %d initial descriptors\n", i);
> > + break;
> > + }
> > + list_add_tail(&desc->desc_node, &tmp_list);
> > + }
> > +
> > + pl330_ch->completed = chan->cookie = 1;
> > + pl330_ch->desc_num = i;
> > + list_splice(&tmp_list, &pl330_ch->free_desc);
> > +
> > + return pl330_ch->desc_num;
> > +}
> > +
>
> [snip]
>
> > +static unsigned int pl330_make_instructions(struct pl330_chan *pl330_ch,
> > + struct pl330_desc *desc, dma_addr_t dest, dma_addr_t src,
> > + size_t len, unsigned int inst_size,
> > + enum dma_data_direction direction)
> > +{
> > + struct device *dev = pl330_ch->pl330_dev->common.dev;
> > + void *buf = desc->desc_pool_virt;
> > + u32 control = *(u32 *)&pl330_ch->pl330_reg_cc;
> > + unsigned int loop_size;
> > + unsigned int loop_size_rest;
> > + unsigned int loop_count0;
> > + unsigned int loop_count1 = 0;
> > + unsigned int loop_count0_rest = 0;
> > + unsigned int loop_start0 = 0;
> > + unsigned int loop_start1 = 0;
> > +
> > + dev_dbg(dev, "desc_pool_phys: 0x%x\n", desc->async_tx.phys);
> > + dev_dbg(dev, "control: 0x%x\n", control);
> > + dev_dbg(dev, "dest: 0x%x\n", dest);
> > + dev_dbg(dev, "src: 0x%x\n", src);
> > + dev_dbg(dev, "len: 0x%x\n", len);
> > +
> > + /* calculate loop count */
> > + loop_size = (pl330_ch->pl330_reg_cc.src_burst_len + 1) *
> > + (1 << pl330_ch->pl330_reg_cc.src_burst_size);
> > + loop_count0 = (len / loop_size) - 1;
> > + loop_size_rest = len % loop_size;
> > +
> > + dev_dbg(dev, "loop_size: 0x%x\n", loop_size);
> > + dev_dbg(dev, "loop_count0: 0x%x\n", loop_count0);
> > + dev_dbg(dev, "loop_size_rest: 0x%x\n", loop_size_rest);
> > +
> > + if (loop_size_rest) {
> > + dev_err(dev, "Transfer length must be aligned to loop_size\n");
> > + return -EINVAL;
> > + }
> This limit, though not serious, is unconditionally imposed by your design.
> There are ways to get around this situation by smarter generation of
> microcode.
>
> > + if (loop_count0 >= PL330_MAX_LOOPS) {
> > + loop_count1 = (loop_count0 / PL330_MAX_LOOPS) - 1;
> > + loop_count0_rest = (loop_count0 % PL330_MAX_LOOPS) + 1;
> > + loop_count0 = PL330_MAX_LOOPS - 1;
> > + dev_dbg(dev, "loop_count0: 0x%x\n", loop_count0);
> > + dev_dbg(dev, "loop_count0_rest: 0x%x\n", loop_count0_rest);
> > + dev_dbg(dev, "loop_count1: 0x%x\n", loop_count1);
> > +
> > + if (loop_count1 >= PL330_MAX_LOOPS)
> > + dev_dbg(dev, "loop_count1 overflow\n");
> Again, the DMA API drivers will suffer just because someone didn't care
> to generate microcode efficiently.
> The microcode template for xfer takes only about 50 bytes and despite
> having PL330_POOL_SIZE buffer, you have to drop xfer requests just because
> the template is not properly designed.
> My implementation is limited only by the microcode buffer size, which in turn
> can be specified at startup by the DMA API driver.
>
> > + }
> > +
> > + /* write instruction sets on buffer */
> > + inst_size += pl330_dmamov(buf + inst_size, RD_DAR, dest);
> > + inst_size += pl330_dmamov(buf + inst_size, RD_SAR, src);
> > + inst_size += pl330_dmamov(buf + inst_size, RD_CCR, control);
> > +
> > + if (loop_count1) {
> > + inst_size += pl330_dmalp(buf + inst_size, loop_count1, LC_1);
> > + loop_start1 = inst_size;
> > + }
> > +
> > + if (loop_count0) {
> > + inst_size += pl330_dmalp(buf + inst_size, loop_count0, LC_0);
> > + loop_start0 = inst_size;
> > + }
> > +
> > + if (direction == DMA_TO_DEVICE) {
> > + struct pl330_dma_slave *dma_slave = pl330_ch->common.private;
> > + u8 periph = dma_slave->peri_num;
> > + inst_size += pl330_dmawfps(buf + inst_size, periph);
> > + inst_size += pl330_dmald(buf + inst_size);
> > + inst_size += pl330_dmastps(buf + inst_size, periph);
> > + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> > + } else if (direction == DMA_FROM_DEVICE) {
> > + struct pl330_dma_slave *dma_slave = pl330_ch->common.private;
> > + u8 periph = dma_slave->peri_num;
> > + inst_size += pl330_dmawfps(buf + inst_size, periph);
> > + inst_size += pl330_dmaldps(buf + inst_size, periph);
> > + inst_size += pl330_dmast(buf + inst_size);
> > + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> > + } else {
> > + inst_size += pl330_dmald(buf + inst_size);
> > + inst_size += pl330_dmarmb(buf + inst_size);
> > + inst_size += pl330_dmast(buf + inst_size);
> > + inst_size += pl330_dmawmb(buf + inst_size);
> > + }
> > +
> > + if (loop_count0)
> > + inst_size += pl330_dmalpend(buf + inst_size,
> > + inst_size - loop_start0, LC_0);
> > +
> > + if (loop_count1)
> > + inst_size += pl330_dmalpend(buf + inst_size,
> > + inst_size - loop_start1, LC_1);
> > +
> > + if (loop_count0_rest) {
> > + inst_size += pl330_dmalp(buf + inst_size, loop_count0_rest - 1,
> > + LC_0);
> > + loop_start0 = inst_size;
> > +
> > + if (direction == DMA_TO_DEVICE) {
> > + struct pl330_dma_slave *dma_slave =
> > + pl330_ch->common.private;
> > + u8 periph = dma_slave->peri_num;
> > + inst_size += pl330_dmawfps(buf + inst_size, periph);
> > + inst_size += pl330_dmald(buf + inst_size);
> > + inst_size += pl330_dmastps(buf + inst_size, periph);
> > + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> > + } else if (direction == DMA_FROM_DEVICE) {
> > + struct pl330_dma_slave *dma_slave =
> > + pl330_ch->common.private;
> > + u8 periph = dma_slave->peri_num;
> > + inst_size += pl330_dmawfps(buf + inst_size, periph);
> > + inst_size += pl330_dmaldps(buf + inst_size, periph);
> > + inst_size += pl330_dmast(buf + inst_size);
> > + inst_size += pl330_dmaflushp(buf + inst_size, periph);
> > + } else {
> > + inst_size += pl330_dmald(buf + inst_size);
> > + inst_size += pl330_dmarmb(buf + inst_size);
> > + inst_size += pl330_dmast(buf + inst_size);
> > + inst_size += pl330_dmawmb(buf + inst_size);
> > + }
> > +
> > + inst_size += pl330_dmalpend(buf + inst_size,
> > + inst_size - loop_start0, LC_0);
> > + }
> > +
> > + inst_size += pl330_dmasev(buf + inst_size, pl330_ch->id);
> > + inst_size += pl330_dmaend(buf + inst_size);
> > +
> > + return inst_size;
> > +}
> This instruction generation leaves no scope for Security permissions for xfers,
> that is a feature of PL330.
>
> [snip]
>
> > +static void pl330_xfer_complete(struct pl330_chan *pl330_ch)
> > +{
> > + struct pl330_desc *desc;
> > + dma_async_tx_callback callback;
> > + void *callback_param;
> > +
> > + /* execute next desc */
> > + pl330_issue_pending(&pl330_ch->common);
> > +
> > + if (list_empty(&pl330_ch->complete_desc))
> > + return;
> > +
> > + desc = to_pl330_desc(pl330_ch->complete_desc.next);
> > + list_move_tail(&desc->desc_node, &pl330_ch->free_desc);
> > +
> > + pl330_ch->completed = desc->async_tx.cookie;
> > +
> > + callback = desc->async_tx.callback;
> > + callback_param = desc->async_tx.callback_param;
> > + if (callback)
> > + callback(callback_param);
> > +}
> > +
> > +static void pl330_ch_tasklet(unsigned long data)
> > +{
> > + struct pl330_chan *pl330_ch = (struct pl330_chan *)data;
> > + unsigned int val;
> > +
> > + pl330_xfer_complete(pl330_ch);
> > +
> > + /* enable channel interrupt */
> > + val = readl(pl330_ch->pl330_dev->reg_base + PL330_INTEN);
> > + val |= (1 << pl330_ch->id);
> > + writel(val, pl330_ch->pl330_dev->reg_base + PL330_INTEN);
> > +}
> > +
> > +static irqreturn_t pl330_irq_handler(int irq, void *data)
> > +{
> > + struct pl330_device *pl330_dev = data;
> > + struct pl330_chan *pl330_ch;
> > + unsigned int intstatus;
> > + unsigned int inten;
> > + int i;
> > +
> > + intstatus = readl(pl330_dev->reg_base + PL330_INTSTATUS);
> > +
> > + if (intstatus == 0)
> > + return IRQ_HANDLED;
> > +
> > + inten = readl(pl330_dev->reg_base + PL330_INTEN);
> > + for (i = 0; i < PL330_MAX_CHANS; i++) {
> > + if (intstatus & (1 << i)) {
> > + pl330_ch = &pl330_dev->pl330_ch[i];
> > + writel(1 << i, pl330_dev->reg_base + PL330_INTCLR);
> > +
> > + /* disable channel interrupt */
> > + inten &= ~(1 << i);
> > + writel(inten, pl330_dev->reg_base + PL330_INTEN);
> > +
> > + tasklet_schedule(&pl330_ch->tasklet);
> I think the DMA API already prohibits doing non-irq-context things(like enqueue)
> in the callbacks, so why implement tasklets here?
> This may still get you "audio working fine" with Samsung I2S controller,
> but is likely to cause problems with more demanding peripherals like SPDIF
> if they operate at best QOS(even 24bits/sample Stereo at 96000Hz) and has
> shallow FIFO(8 samples deep and hence 84 usecs acceptable latency).
> Remember that SPDIF usually goes with other system load like HDMI HD
> playaback which only increases the interrupt latency.
>
> Not to forget, the overall throughput hit taken by other dma clients,
> like MMC over SPI that use 256/512 bytes DMA xfers, due to delayed
> DMA-DONE notifications.
>
> Also, using tasklet here may break any protocol that involves _time-bound_ ACK
> via some register after the xfer has been done.
>
> If some client needs to do sleepable-context stuff after DMA-Xfer-Done,
> let that driver implement tasklet in it's callback rather than have every
> client pay the price.
>
> > + }
> > + }
> > +
> > + return IRQ_HANDLED;
> > +}
> > +
>
> [snip]
>
> > +
> > +static int __devinit pl330_probe(struct amba_device *adev, struct amba_id *id)
> > +{
> > + struct pl330_device *pl330_dev;
>
> [snip]
>
> > +
> > + for (i = 0; i < PL330_MAX_CHANS; i++) {
> This whole code is designed around the assumption of every DMAC having
> PL330_MAX_CHANS channels. That is dangerous, since PL330 is highly
> configurable and some implementation may choose to implement less than
> PL330_MAX_CHANS(i.e 8) channels.
> As the PL330 spec says, most operations for non-existing channel result in
> DMA Abort. Further, the IRQ handler assumes utopia and doesn't even
> care to check
> such conditions, as a result on non-s3c like implementations there are many
> chances the system will just hang looping in DMA Abort irq or no irq at all
> depending upon the cause.
> Not to mention the unnecessary allocation for MAX possible resources, though
> not very serious.

ditto.

> regards
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel(a)lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

--
--
Ben

Q: What's a light-year?
A: One-third less calories than a regular year.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: jassi brar on
On Wed, Mar 31, 2010 at 10:07 AM, Ben Dooks <ben-linux(a)fluff.org> wrote:
> On Fri, Mar 26, 2010 at 11:08:06AM +0900, jassi brar wrote:
>> On Thu, Mar 25, 2010 at 12:17 PM, Joonyoung Shim
>> <jy0922.shim(a)samsung.com> wrote:
>> > +static struct pl330_desc *
>> > +pl330_alloc_descriptor(struct pl330_chan *pl330_ch, gfp_t flags)
>> > +{
>> > +       struct device *dev = pl330_ch->pl330_dev->common.dev;
>> > +       struct pl330_desc *desc;
>> > +       dma_addr_t phys;
>> > +
>> > +       desc = kzalloc(sizeof(*desc), flags);
>> > +       if (!desc)
>> > +               return NULL;
>> > +
>> > +       desc->desc_pool_virt = dma_alloc_coherent(dev, PL330_POOL_SIZE, &phys,
>> > +                       flags);
>> These allocations are inefficient and don't need to be done so often.
>> My implementation allocates a pool of such buffers(size specified by
>> DMA API driver)
>> and manage them by simple pointer manipulation.
>> Though the xfer requests for DMA API has to be managed in the DMA API driver.
>
> There's a dma pool implementation too in the kernel.
I meant during 'probe' of the DMAC a chunk of dma consistent memory is allocated
for MicroCode for each channel.
We use the same chunk during xfers, since we can generate MC in a way that 256
bytes are enough to do xfer of 2.5MB at burst size of 1 byte and for
bulkier requests
the DMA API driver can either break the bigger request or allocate
bigger chunk for
the channels.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: jassi brar on
On Fri, Apr 2, 2010 at 8:23 AM, Linus Walleij
<linus.ml.walleij(a)gmail.com> wrote:
> Hi Jassi,
> The only advantage of the other driver by Joonyoung is that it is finished and
> ready for integration. If you finalize the DMA devices/engine API and post
> this in time for the next merge window I would easily vote for including this
> one rather than the other one. (Whatever that means for the world.)
> Simply for technical merits.
Thank you. I am working on it full time. This submission was solely as
a preview, the
code needs to be optimized, polished and tested, though I don't anticipate much
changes in the API. By when should I submit the final version?

.........

> I understand it that as this is the core engine so you intend to keep the core
> in arch/arm/common/* and then a separate interface to the DMAdevices
> implementing <linux/dmaengine.h> in drivers/dma/ and this is what the
> "DMA API" referenced below refers to?
Yes, drivers/dma/pl330.c could be one DMA API driver.
Another may implement S3C DMA API and reside in arch/arm/plat-samsung/ or
wherever the S3C PL080 driver is atm.
So, DMA API driver is the frontend that makes use of the
arch/arm/common/pl330.c
backend driver.

> In that case I really like this clear separation between hardware driver
> and DMA devices/engine API. And I see that the DMA API is not far
> away. If you implement it you will be able to excersise this with the
> DMA engine memcpy test to assure it's working.
Yes, implementing PL330 core was supposed to be the major challenge.
DMA API drivers should be easy, though I plan to first test the pl330.c with
S3C DMA API because that way I'll have a benchmark to compare the stability
and performance of this new driver.
Of course, I'll like to see driver/dma driver as well, but maybe JoonYoung wants
to implement that part, if he doesn't show interest then I will.

> There is nothing wrong with moving this entire thing except the header
> file into drivers/dma it will be more comfortable there, with the other
> DMA drivers. Whether the header should be in include/linux/amba
> or include/linux/dma is however a good question for the philosophers,
> but I would stick it into linux/amba with the rest of the PrimeCells.
> But perhaps you have better ideas.
For platforms that choose to implement their own DMA API (how good or bad is a
different subject), arch/arm/common/ seems be more appropriate for this pl330.c
And all drivers in drivers/dma/ implement the include/linux/dmaengine.h API

> 2010/4/1 jassi brar <jassisinghbrar(a)gmail.com>:
>
>> o  The DMA API driver submits 'request' to PL330 engine.
>>     A request is a sequence of DMA 'xfers' to be done before the DMA
>> API driver wants to be notified.
>
> This hints that there is some other patch to provide that API
> <linux/dmaengine.h> that is not part of this patch, right?
Right, and that is what I call Client/DMA-API driver. Though the patch is not
ready yet.

>>     A req can be a scatter-Gather-List.
>
> This is great, do you also plan to support that for M<->M xfers like we
> added for the DMA40? Then we might want to lift that into the generic
> DMA engine.
It should already be working with this driver as such.

>> o  TODO: Desirable is to implement true LLI using MicroCode
>> modification during each
>>    request enqueue, so that the xfer continues even while IRQ is
>> handled and callbacks made.
>>    To me, there doesn't seem to be a way to flush ICACHE of a channel
>> without halting it, so we
>>    can't modify MicroCode in runtime. Using two channels per client
>> to achieve true LLI is the last resort.
>
> True, not as elegant as being able to do it with microcode but
> still quite elegant.
I hope the driver is efficient/fast enough for some tough test cases I have,
otherwise I might have to modify or add to the API to implement this
two-channels per user situation.

>>    So currently, cpu intervention is required to trigger each xfer,
>> hence interrupt latency might play
>>    some role.
>
> From the DMA API level in the PrimeCell drivers the crucial driver that
> need something like this is the AMBA PL011 UART driver, RX part,
> where data comes in from the outside and we have no control over
> the data flow. I trigger one transfer to a buffer here, then wait for it
> to complete or be interrupted. If it completes, I immediately trigger
> another transfer to the second buffer before I start processing the just
> recieved buffer (like front/back buffers).
If I get it right, that is common issue with any 'slave-receiver' type device
and it might do good to have timeout option support in DMA API for such receive
requests. That is provide whatever data is collected within some
amount of time,
provide that to upper layers and queue request again.

>> o  TODO: PAUSE/RESUME support. Currently the DMA API driver has to emulate it.
>
> The only PrimeCell that needs this is currently again the PL011.
> It needs to PAUSE then get the number of pending bytes and then
> terminate the transfer. This is done when we timeout transfers e.g.
> for UART consoles. So being able to pause and retrieve the number
> of bytes left and then cancel is the most advanced sequence that
> will be used by a PrimeCell currently.
Even with this implementation, for PAUSE, the DMA API driver can call
pl330_chan_status, saving remaining requests locally and calling
PL330_OP_ABORT. During RESUME it simply submit remaining requests again.

>> +/* Returns bytes consumed and updates bursts */
>> +static inline int _loop(unsigned dry_run, u8 buf[],
>> +               unsigned long *bursts, const struct _xfer_spec *pxs)
>> +{
>> +       int cyc, cycmax, szlp, szlpend, szbrst, off;
>> +       unsigned lcnt0, lcnt1, ljmp0, ljmp1;
>> +       struct _arg_LPEND lpend;
>> +
>> +       /* Max iterations possibile in DMALP is 256 */
>> +       if (*bursts >= 256*256) {
>> +               lcnt1 = 256;
>> +               lcnt0 = 256;
>> +               cyc = *bursts / lcnt1 / lcnt0;
>> +       } else if (*bursts > 256) {
>> +               lcnt1 = 256;
>> +               lcnt0 = *bursts / lcnt1;
>> +               cyc = 1;
>> +       } else {
>> +               lcnt1 = *bursts;
>> +               lcnt0 = 0;
>> +               cyc = 1;
>> +       }
>> +
>> +       szlp = _emit_LP(1, buf, 0, 0);
>> +       szbrst = _bursts(1, buf, pxs, 1);
>> +
>> +       lpend.cond = ALWAYS;
>> +       lpend.forever = false;
>> +       lpend.loop = 0;
>> +       lpend.bjump = 0;
>> +       szlpend = _emit_LPEND(1, buf, &lpend);
>> +
>> +       if (lcnt0) {
>> +               szlp *= 2;
>> +               szlpend *= 2;
>> +       }
>> +
>> +       /** Do not mess with the construct **/
>
> Which means? Hackers like to mess with stuff... Note to self?
> Usually comments like that is a trace of questionable design
> so if the design is solid, remove the comments because then it
> will be obvious that you don't want to mess with the construct.
That is mostly to self. It just means that every variable in the block
must be analyzed before making any change there.

.......

>> +xfer_exit:
>> +       mutex_unlock(&thrd->mtx);
>> +       return ret;
>> +}
>> +EXPORT_SYMBOL(pl330_submit_req);
>
> For all exported symbols: I have a hard time seeing anyone compiling the
> DMA engine driver or anything else using this as a module and making use
> of all these exports. But maybe for testing, what do I know...
I think it is considered good practice to export every symbol of an API.

.......

>> +
>> +       /* Check if we can handle this DMAC */
>> +       if (get_id(pl330, PERIPH_ID) != PERIPH_ID_VAL
>> +          || get_id(pl330, PCELL_ID) != PCELL_ID_VAL) {
>> +               printk(KERN_INFO "PERIPH_ID 0x%x, PCELL_ID 0x%x !\n",
>> +                       readl(regs + PERIPH_ID), readl(regs + PCELL_ID));
>
> dev_info(pl330->dev, ...)
>
>> +               return -EINVAL;
>> +       }
>
> If the parent device (IMO a DMAdevices/DMAengine) is an struct amba_device
> I don't think this ID check is necessary, there is already PrimeCell
> matching code in
> <linux/amba/bus.h>
As I said, this driver is designed to be usable with any DMA API, and
some, like S3C,
do not see the DMAC as some amba device.

>> +
>> +       /* Make sure it isn't already added */
>> +       list_for_each_entry(pt, &pl330_list, node)
>> +               if (pt == pl330)
>
> Perhaps print some warning here. Doesn't seem sound that this
> would happen.
The check is there just for some robustness.

.......

>> +extern struct pl330_info *pl330_alloc(struct device *);
>> +extern int pl330_add(struct pl330_info *);
>> +extern void pl330_del(struct pl330_info *pi);
>> +extern int pl330_update(struct pl330_info *pi);
>> +extern void pl330_release_channel(void *ch_id);
>> +extern void *pl330_request_channel(struct pl330_info *pi);
>> +extern int pl330_chan_status(void *ch_id, struct pl330_chanstatus *pstatus);
>> +extern int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op);
>> +extern int pl330_submit_req(void *ch_id, struct pl330_req *r);
>> +extern void pl330_free(struct pl330_info *pi);
>
>
> Do you really need both pairs:
>
> pl330_alloc() + pl330_add() and
> pl330_del() + pl330_free()
yes I was already thinking on similar lines. I'll merge them to one.

As I said, this code was just for preview. It needs to undergo at
least one cycle of
optimizing->polishing->testing before I finally submit for inclusion.
comments, prints, types etc will be modified to match other code in
the directory
it will be aimed to be put in. Of course, I have taken every feedback you gave.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Kyungmin Park on
Hi,

If there are no more comments, it's time to select one to merge.
I can't tell which is better, so I hope the community selects one.
The only consideration should be which is more proper for Linux and future usage.

To Linus Walleij.
I hope it will merge your patch series together.

Thank you,
Kyungmin Park

On Fri, Apr 2, 2010 at 10:38 AM, jassi brar <jassisinghbrar(a)gmail.com> wrote:
> On Fri, Apr 2, 2010 at 8:23 AM, Linus Walleij
> <linus.ml.walleij(a)gmail.com> wrote:
>> Hi Jassi,
>> The only advantage of the other driver by Joonyoung is that it is finished and
>> ready for integration. If you finalize the DMA devices/engine API and post
>> this in time for the next merge window I would easily vote for including this
>> one rather than the other one. (Whatever that means for the world.)
>> Simply for technical merits.
> Thank you. I am working on it full time. This submission was solely as
> a preview, the
> code needs to be optimized, polished and tested, though I don't anticipate much
> changes in the API. By when should I submit the final version?
>
> ........
>
>> I understand it that as this is the core engine so you intend to keep the core
>> in arch/arm/common/* and then a separate interface to the DMAdevices
>> implementing <linux/dmaengine.h> in drivers/dma/ and this is what the
>> "DMA API" referenced below refers to?
> Yes, drivers/dma/pl330.c could be one DMA API driver.
> Another may implement S3C DMA API and reside in arch/arm/plat-samsung/ or
> wherever the S3C PL080 driver is atm.
> So, DMA API driver is the frontend that makes use of the
> arch/arm/common/pl330.c
> backend driver.
>
>> In that case I really like this clear separation between hardware driver
>> and DMA devices/engine API. And I see that the DMA API is not far
>> away. If you implement it you will be able to excersise this with the
>> DMA engine memcpy test to assure it's working.
> Yes, implementing PL330 core was supposed to be the major challenge.
> DMA API drivers should be easy, though I plan to first test the pl330.c with
> S3C DMA API because that way I'll have a benchmark to compare the stability
> and performance of this new driver.
> Of course, I'll like to see driver/dma driver as well, but maybe JoonYoung wants
> to implement that part, if he doesn't show interest then I will.
>
>> There is nothing wrong with moving this entire thing except the header
>> file into drivers/dma it will be more comfortable there, with the other
>> DMA drivers. Whether the header should be in include/linux/amba
>> or include/linux/dma is however a good question for the philosophers,
>> but I would stick it into linux/amba with the rest of the PrimeCells.
>> But perhaps you have better ideas.
> For platforms that choose to implement their own DMA API (how good or bad is a
> different subject), arch/arm/common/ seems be more appropriate for this pl330.c
> And all drivers in drivers/dma/ implement the include/linux/dmaengine.h API
>
>> 2010/4/1 jassi brar <jassisinghbrar(a)gmail.com>:
>>
>>> o  The DMA API driver submits 'request' to PL330 engine.
>>>     A request is a sequence of DMA 'xfers' to be done before the DMA
>>> API driver wants to be notified.
>>
>> This hints that there is some other patch to provide that API
>> <linux/dmaengine.h> that is not part of this patch, right?
> Right, and that is what I call Client/DMA-API driver. Though the patch is not
> ready yet.
>
>>>     A req can be a scatter-Gather-List.
>>
>> This is great, do you also plan to support that for M<->M xfers like we
>> added for the DMA40? Then we might want to lift that into the generic
>> DMA engine.
> It should already be working with this driver as such.
>
>>> o  TODO: Desirable is to implement true LLI using MicroCode
>>> modification during each
>>>    request enqueue, so that the xfer continues even while IRQ is
>>> handled and callbacks made.
>>>    To me, there doesn't seem to be a way to flush ICACHE of a channel
>>> without halting it, so we
>>>    can't modify MicroCode in runtime. Using two channels per client
>>> to achieve true LLI is the last resort.
>>
>> True, not as elegant as being able to do it with microcode but
>> still quite elegant.
> I hope the driver is efficient/fast enough for some tough test cases I have,
> otherwise I might have to modify or add to the API to implement this
> two-channels per user situation.
>
>>>    So currently, cpu intervention is required to trigger each xfer,
>>> hence interrupt latency might play
>>>    some role.
>>
>> From the DMA API level in the PrimeCell drivers the crucial driver that
>> need something like this is the AMBA PL011 UART driver, RX part,
>> where data comes in from the outside and we have no control over
>> the data flow. I trigger one transfer to a buffer here, then wait for it
>> to complete or be interrupted. If it completes, I immediately trigger
>> another transfer to the second buffer before I start processing the just
>> recieved buffer (like front/back buffers).
> If I get it right, that is common issue with any 'slave-receiver' type device
> and it might do good to have timeout option support in DMA API for such receive
> requests. That is provide whatever data is collected within some
> amount of time,
> provide that to upper layers and queue request again.
>
>>> o  TODO: PAUSE/RESUME support. Currently the DMA API driver has to emulate it.
>>
>> The only PrimeCell that needs this is currently again the PL011.
>> It needs to PAUSE then get the number of pending bytes and then
>> terminate the transfer. This is done when we timeout transfers e.g.
>> for UART consoles. So being able to pause and retrieve the number
>> of bytes left and then cancel is the most advanced sequence that
>> will be used by a PrimeCell currently.
> Even with this implementation, for PAUSE, the DMA API driver can call
> pl330_chan_status, saving remaining requests locally and calling
> PL330_OP_ABORT. During RESUME it simply submit remaining requests again.
>
>>> +/* Returns bytes consumed and updates bursts */
>>> +static inline int _loop(unsigned dry_run, u8 buf[],
>>> +               unsigned long *bursts, const struct _xfer_spec *pxs)
>>> +{
>>> +       int cyc, cycmax, szlp, szlpend, szbrst, off;
>>> +       unsigned lcnt0, lcnt1, ljmp0, ljmp1;
>>> +       struct _arg_LPEND lpend;
>>> +
>>> +       /* Max iterations possibile in DMALP is 256 */
>>> +       if (*bursts >= 256*256) {
>>> +               lcnt1 = 256;
>>> +               lcnt0 = 256;
>>> +               cyc = *bursts / lcnt1 / lcnt0;
>>> +       } else if (*bursts > 256) {
>>> +               lcnt1 = 256;
>>> +               lcnt0 = *bursts / lcnt1;
>>> +               cyc = 1;
>>> +       } else {
>>> +               lcnt1 = *bursts;
>>> +               lcnt0 = 0;
>>> +               cyc = 1;
>>> +       }
>>> +
>>> +       szlp = _emit_LP(1, buf, 0, 0);
>>> +       szbrst = _bursts(1, buf, pxs, 1);
>>> +
>>> +       lpend.cond = ALWAYS;
>>> +       lpend.forever = false;
>>> +       lpend.loop = 0;
>>> +       lpend.bjump = 0;
>>> +       szlpend = _emit_LPEND(1, buf, &lpend);
>>> +
>>> +       if (lcnt0) {
>>> +               szlp *= 2;
>>> +               szlpend *= 2;
>>> +       }
>>> +
>>> +       /** Do not mess with the construct **/
>>
>> Which means? Hackers like to mess with stuff... Note to self?
>> Usually comments like that is a trace of questionable design
>> so if the design is solid, remove the comments because then it
>> will be obvious that you don't want to mess with the construct.
> That is mostly to self. It just means that every variable in the block
> must be analyzed before making any change there.
>
> ......
>
>>> +xfer_exit:
>>> +       mutex_unlock(&thrd->mtx);
>>> +       return ret;
>>> +}
>>> +EXPORT_SYMBOL(pl330_submit_req);
>>
>> For all exported symbols: I have a hard time seeing anyone compiling the
>> DMA engine driver or anything else using this as a module and making use
>> of all these exports. But maybe for testing, what do I know...
> I think it is considered good practice to export every symbol of an API.
>
> ......
>
>>> +
>>> +       /* Check if we can handle this DMAC */
>>> +       if (get_id(pl330, PERIPH_ID) != PERIPH_ID_VAL
>>> +          || get_id(pl330, PCELL_ID) != PCELL_ID_VAL) {
>>> +               printk(KERN_INFO "PERIPH_ID 0x%x, PCELL_ID 0x%x !\n",
>>> +                       readl(regs + PERIPH_ID), readl(regs + PCELL_ID));
>>
>> dev_info(pl330->dev, ...)
>>
>>> +               return -EINVAL;
>>> +       }
>>
>> If the parent device (IMO a DMAdevices/DMAengine) is an struct amba_device
>> I don't think this ID check is necessary, there is already PrimeCell
>> matching code in
>> <linux/amba/bus.h>
> As I said, this driver is designed to be usable with any DMA API, and
> some, like S3C,
> do not see the DMAC as some amba device.
>
>>> +
>>> +       /* Make sure it isn't already added */
>>> +       list_for_each_entry(pt, &pl330_list, node)
>>> +               if (pt == pl330)
>>
>> Perhaps print some warning here. Doesn't seem sound that this
>> would happen.
> The check is there just for some robustness.
>
> ......
>
>>> +extern struct pl330_info *pl330_alloc(struct device *);
>>> +extern int pl330_add(struct pl330_info *);
>>> +extern void pl330_del(struct pl330_info *pi);
>>> +extern int pl330_update(struct pl330_info *pi);
>>> +extern void pl330_release_channel(void *ch_id);
>>> +extern void *pl330_request_channel(struct pl330_info *pi);
>>> +extern int pl330_chan_status(void *ch_id, struct pl330_chanstatus *pstatus);
>>> +extern int pl330_chan_ctrl(void *ch_id, enum pl330_chan_op op);
>>> +extern int pl330_submit_req(void *ch_id, struct pl330_req *r);
>>> +extern void pl330_free(struct pl330_info *pi);
>>
>>
>> Do you really need both pairs:
>>
>> pl330_alloc() + pl330_add() and
>> pl330_del() + pl330_free()
> yes I was already thinking on similar lines. I'll merge them to one.
>
> As I said, this code was just for preview. It needs to undergo at
> least one cycle of
> optimizing->polishing->testing before I finally submit for inclusion.
> comments, prints, types etc will be modified to match other code in
> the directory
> it will be aimed to be put in. Of course, I have taken every feedback you gave.
>
> Thanks.
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel(a)lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/