Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This might help to achieve faster SPI transfer speeds #61

Closed
deividAlfa opened this issue Feb 2, 2024 · 4 comments
Closed

This might help to achieve faster SPI transfer speeds #61

deividAlfa opened this issue Feb 2, 2024 · 4 comments

Comments

@deividAlfa
Copy link

deividAlfa commented Feb 2, 2024

Hi, based on xfel, I made a program to backup/restore the SPI NAND of a F1C200s-based device (Hantek DSO2C10).
Originally the transfer speeds were very low, about 40KB/s, so 128MB took a very long time.
After analyzing how xfel worked, I made some modifications and was able to read/write to the nand at 300KB/s.

The cmd/swap buffers are very small, so it can't run large queues, requiring a lot of overhead in small packets.
I modified the SPI init to also initialize the SDRAM, and changed the payload + SPI run function to use an SDRAM address instead of the SRAM buffer at 0x9800.
(This could be done for all SoCs having embedded SDRAM).

Now it's possible to run large command queues and read/write huge data blocks.
For example, this program does:

  • Read 128*2K flash pages, store in SDRAM, then Transfer 128*2K bytes from SDRAM.
  • Transfer 128*2K bytes to SDRAM, then Write 128*2K flash pages from SDRAM.

However, the speed will still be limited to ~100KB/s due to the USB speed.
It's possible to set the USB in HS mode, see here.
But this doesn't work with xfel write32 0x01c13040 0x29860; I get a usb bulk error...
So, I'm using sunxi-fel writel 0x01c13040 0x29860.

It's not on GitHub, but I'm attaching it here.
Due to the nature of the program, all the unnecessary code / SoC support (other than for the F1C200s) was removed.

dsoflash-src_cygwin64.zip

@jianjunjiang
Copy link
Member

Yes, you are right: small packets require a lot of overhead.

The command xfel write32 0x01c13040 0x29860 doesn't work, because byte access does not work with the USB registers.

/*
 * The R32 and W32 macros may only be used on addresses that tolerate byte
 * access. Do not use them on addresses that only support word access,
 * because the fel protocol can only perform byte operations.
 * VERY IMPORTANT !!!
 *
 * Both macros expand to a call that references a variable named `ctx`,
 * so one must be in scope at the point of use.
 */
#define R32(reg)		fel_read32(ctx, reg)
#define W32(reg, val)	fel_write32(ctx, reg, val)

You may need the code below:

/*
 * Read one 32-bit word from target address `addr` by running a small ARM
 * stub on the device instead of using the plain fel read path.
 *
 * Layout in the scratchpad (ctx->version.scratchpad):
 *   [stub code][addr argument (LE)][result word (LE)]
 * The stub is uploaded, the address is placed right behind it, the stub is
 * executed, and the word it stored behind the argument is read back.
 *
 * Returns the value converted with le32_to_cpu().
 */
static uint32_t payload_read32(struct xfel_ctx_t * ctx, uint32_t addr)
{
	/*
	 * One instruction (4 little-endian bytes) per line. Mnemonics were
	 * decoded by hand from the ARM encodings - verify with a disassembler.
	 * NOTE(review): the CP15 operations at the start (notably c7,c5,6 and
	 * c7,c5,4) may not be implemented on every ARM core - confirm they are
	 * valid for the SoC this stub is run on.
	 */
	static const uint8_t stub[] = {
		0x00, 0x00, 0xa0, 0xe3,	/* mov r0, #0                      */
		0x17, 0x0f, 0x08, 0xee,	/* mcr p15, 0, r0, c8, c7, 0       */
		0x15, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 0       */
		0xd5, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 6       */
		0x9a, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c10, 4      */
		0x95, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 4       */
		0xff, 0xff, 0xff, 0xea,	/* b   <next instruction>          */
		0x0c, 0x00, 0x9f, 0xe5,	/* ldr r0, [pc, #12] ; addr arg    */
		0x0c, 0x10, 0x8f, 0xe2,	/* add r1, pc, #12   ; result slot */
		0x00, 0x20, 0x90, 0xe5,	/* ldr r2, [r0]                    */
		0x00, 0x20, 0x81, 0xe5,	/* str r2, [r1]                    */
		0x1e, 0xff, 0x2f, 0xe1,	/* bx  lr                          */
	};
	uint32_t le_addr = cpu_to_le32(addr);
	uint32_t le_result;
	/* Byte offsets of the argument and result slots behind the stub. */
	const size_t arg_off = sizeof(stub);
	const size_t res_off = arg_off + sizeof(le_addr);

	fel_write(ctx, ctx->version.scratchpad, (void *)stub, sizeof(stub));
	fel_write(ctx, ctx->version.scratchpad + arg_off, (void *)&le_addr, sizeof(le_addr));
	fel_exec(ctx, ctx->version.scratchpad);
	fel_read(ctx, ctx->version.scratchpad + res_off, (void *)&le_result, sizeof(le_result));

	return le32_to_cpu(le_result);
}

/*
 * Write the 32-bit value `val` to target address `addr` by running a small
 * ARM stub on the device instead of using the plain fel write path.
 *
 * Layout in the scratchpad (ctx->version.scratchpad):
 *   [stub code][addr (LE)][val (LE)]
 * The stub and its two arguments are uploaded, then the stub is executed.
 */
static void payload_write32(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t val)
{
	/*
	 * One instruction (4 little-endian bytes) per line. Mnemonics were
	 * decoded by hand from the ARM encodings - verify with a disassembler.
	 * NOTE(review): the CP15 operations at the start (notably c7,c5,6 and
	 * c7,c5,4) may not be implemented on every ARM core - confirm they are
	 * valid for the SoC this stub is run on.
	 */
	static const uint8_t stub[] = {
		0x00, 0x00, 0xa0, 0xe3,	/* mov r0, #0                   */
		0x17, 0x0f, 0x08, 0xee,	/* mcr p15, 0, r0, c8, c7, 0    */
		0x15, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 0    */
		0xd5, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 6    */
		0x9a, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c10, 4   */
		0x95, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 4    */
		0xff, 0xff, 0xff, 0xea,	/* b   <next instruction>       */
		0x08, 0x00, 0x9f, 0xe5,	/* ldr r0, [pc, #8]  ; addr arg */
		0x08, 0x10, 0x9f, 0xe5,	/* ldr r1, [pc, #8]  ; val arg  */
		0x00, 0x10, 0x80, 0xe5,	/* str r1, [r0]                 */
		0x1e, 0xff, 0x2f, 0xe1,	/* bx  lr                       */
	};
	/* Arguments in the order the stub loads them: address, then value. */
	uint32_t args[2];

	args[0] = cpu_to_le32(addr);
	args[1] = cpu_to_le32(val);

	fel_write(ctx, ctx->version.scratchpad, (void *)stub, sizeof(stub));
	fel_write(ctx, ctx->version.scratchpad + sizeof(stub), (void *)args, sizeof(args));
	fel_exec(ctx, ctx->version.scratchpad);
}

@deividAlfa
Copy link
Author

Great, will try it out, thanks!

@deividAlfa
Copy link
Author

deividAlfa commented Feb 3, 2024

I made these changes to chip_t struct:

struct chip_t {
        ...
	/* Per-chip hook: read one 32-bit value from target address `addr`. */
	uint32_t (*read32)(struct xfel_ctx_t * ctx, uint32_t addr);
	/* Per-chip hook: write 32-bit `val` to target address `addr`; returns an int status. */
	int (*write32)(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t val);
};
struct chip_t f1c100s_f1c200s_f1c500s = {
        ...
	/* Wire the 32-bit access hooks to the payload_* helper functions. */
	.write32 = payload_write32,
	.read32 = payload_read32,
};



I hardcoded the scratchpad to 0x8800 (it's a small payload, so it fits, and it will also execute faster from SRAM):

/*
 * Read one 32-bit word from target address `addr` by running a small ARM
 * stub from the fixed address 0x00008800.
 *
 * Memory layout starting at 0x00008800:
 *   [stub code][addr argument (LE)][result word (LE)]
 *
 * Returns the value converted with le32_to_cpu().
 */
static uint32_t payload_read32(struct xfel_ctx_t * ctx, uint32_t addr)
{
	/*
	 * One instruction (4 little-endian bytes) per line. Mnemonics were
	 * decoded by hand from the ARM encodings - verify with a disassembler.
	 * NOTE(review): the CP15 operations at the start (notably c7,c5,6 and
	 * c7,c5,4) may not be implemented on every ARM core - confirm they are
	 * valid for the SoC this stub is run on.
	 */
	static const uint8_t stub[] = {
		0x00, 0x00, 0xa0, 0xe3,	/* mov r0, #0                      */
		0x17, 0x0f, 0x08, 0xee,	/* mcr p15, 0, r0, c8, c7, 0       */
		0x15, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 0       */
		0xd5, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 6       */
		0x9a, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c10, 4      */
		0x95, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 4       */
		0xff, 0xff, 0xff, 0xea,	/* b   <next instruction>          */
		0x0c, 0x00, 0x9f, 0xe5,	/* ldr r0, [pc, #12] ; addr arg    */
		0x0c, 0x10, 0x8f, 0xe2,	/* add r1, pc, #12   ; result slot */
		0x00, 0x20, 0x90, 0xe5,	/* ldr r2, [r0]                    */
		0x00, 0x20, 0x81, 0xe5,	/* str r2, [r1]                    */
		0x1e, 0xff, 0x2f, 0xe1,	/* bx  lr                          */
	};
	uint32_t le_addr = cpu_to_le32(addr);
	uint32_t le_result;
	/* Byte offsets of the argument and result slots behind the stub. */
	const size_t arg_off = sizeof(stub);
	const size_t res_off = arg_off + sizeof(le_addr);

	fel_write(ctx, 0x00008800, (void *)stub, sizeof(stub));
	fel_write(ctx, 0x00008800 + arg_off, (void *)&le_addr, sizeof(le_addr));
	fel_exec(ctx, 0x00008800);
	fel_read(ctx, 0x00008800 + res_off, (void *)&le_result, sizeof(le_result));

	return le32_to_cpu(le_result);
}

/*
 * Write the 32-bit value `val` to target address `addr` by running a small
 * ARM stub from the fixed address 0x00008800.
 *
 * Memory layout starting at 0x00008800:
 *   [stub code][addr (LE)][val (LE)]
 *
 * Always returns 0.
 */
static int payload_write32(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t val)
{
	/*
	 * One instruction (4 little-endian bytes) per line. Mnemonics were
	 * decoded by hand from the ARM encodings - verify with a disassembler.
	 * NOTE(review): the CP15 operations at the start (notably c7,c5,6 and
	 * c7,c5,4) may not be implemented on every ARM core - confirm they are
	 * valid for the SoC this stub is run on.
	 */
	static const uint8_t stub[] = {
		0x00, 0x00, 0xa0, 0xe3,	/* mov r0, #0                   */
		0x17, 0x0f, 0x08, 0xee,	/* mcr p15, 0, r0, c8, c7, 0    */
		0x15, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 0    */
		0xd5, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 6    */
		0x9a, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c10, 4   */
		0x95, 0x0f, 0x07, 0xee,	/* mcr p15, 0, r0, c7, c5, 4    */
		0xff, 0xff, 0xff, 0xea,	/* b   <next instruction>       */
		0x08, 0x00, 0x9f, 0xe5,	/* ldr r0, [pc, #8]  ; addr arg */
		0x08, 0x10, 0x9f, 0xe5,	/* ldr r1, [pc, #8]  ; val arg  */
		0x00, 0x10, 0x80, 0xe5,	/* str r1, [r0]                 */
		0x1e, 0xff, 0x2f, 0xe1,	/* bx  lr                       */
	};
	/* Arguments in the order the stub loads them: address, then value. */
	uint32_t args[2];

	args[0] = cpu_to_le32(addr);
	args[1] = cpu_to_le32(val);

	fel_write(ctx, 0x00008800, (void *)stub, sizeof(stub));
	fel_write(ctx, 0x00008800 + sizeof(stub), (void *)args, sizeof(args));
	fel_exec(ctx, 0x00008800);

	return 0;
}



This is the code switching USB mode:

    /*
     * Switch the device's USB to HS mode (per the discussion, by writing
     * 0x29860 to 0x01c13040), then drop the libusb session and open a new
     * one once the device has had time to re-enumerate.
     */
    printf("Configuring USB to HS mode... ");
    ctx.chip->write32(&ctx, 0x01c13040, 0x29860);

    /* Tear down the current USB session. */
    libusb_close(ctx.hdl);
    libusb_exit(NULL);

    /* Wait 2 seconds for USB re-enumeration. */
    sleep(2);

    /* Bring libusb back up and look for the FEL device again. */
    libusb_init(NULL);
    ctx.hdl = libusb_open_device_with_vid_pid(NULL, 0x1f3a, 0xefe8);
    if(fel_init(&ctx))
    {
        printf("OK\r\n");
    }
    else
    {
        printf("ERROR: No FEL device found\r\n");
        return -1;
    }

I get Configuring USB to HS mode... usb bulk send error.




Before messing with the USB, I tried reading/writing to the scratchpad:
Still crashing on reading or writing.

    printf("Writing to 0x9800...\r\n");
    ctx.chip->write32(&ctx, 0x9800, 0x12345678);    
    printf("Reading from 0x9800...\r\n");
    uint32_t r = ctx.chip->read32(&ctx, 0x9800);
    printf("Written: 0x%08X    Read: 0x%08X\r\n", 0x12345678, r);

@deividAlfa
Copy link
Author

deividAlfa commented Feb 3, 2024

I copied the read/write functions from sunxi-fel and adapted them, this works:
(Also the read/write test)
Configuring USB to HS mode... OK

/*
 * Read a single 32-bit word from target address `addr`.
 * Thin wrapper around fel_readl_n() with a word count of 1.
 */
static uint32_t payload_read32(struct xfel_ctx_t * ctx, uint32_t addr)
{
    uint32_t word;

    fel_readl_n(ctx, addr, &word, 1);

    return word;
}

/*
 * Write the single 32-bit word `val` to target address `addr`.
 * Thin wrapper around fel_writel_n() with a word count of 1.
 * Always returns 1.
 */
static int payload_write32(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t val)
{
    uint32_t word = val;

    fel_writel_n(ctx, addr, &word, 1);

    return 1;
}
/*
 * We don't want the scratch code/buffer to exceed a maximum size of 0x400 bytes
 * (256 32-bit words) on readl_n/writel_n transfers. To guarantee this, we have
 * to account for the amount of space the ARM code uses.
 *
 * With the values below, LCODE_MAX_WORDS evaluates to 0x100 - 12 = 244 data
 * words per transfer. The readl_n/writel_n routines add LCODE_MAX_WORDS
 * directly into ARM instruction words and assert that it is below 256.
 */
#define LCODE_ARM_WORDS  12 /* word count of the [read/write]l_n scratch code */
#define LCODE_ARM_SIZE   (LCODE_ARM_WORDS << 2) /* code size in bytes */
#define LCODE_MAX_TOTAL  0x100 /* max. words in buffer */
#define LCODE_MAX_WORDS  (LCODE_MAX_TOTAL - LCODE_ARM_WORDS) /* data words */

/*
 * Multiple "readl" from sequential addresses to a destination buffer.
 *
 * Reads `count` consecutive 32-bit words starting at target address `addr`
 * into `dst`. A small ARM routine, followed by its two parameters (address
 * and word count), is uploaded to 0x00008800 and executed; it copies the
 * words into the space directly behind itself. They are then fetched with
 * fel_read() and passed through le32toh() on the way into `dst`.
 *
 * A `count` of 0 is a no-op. A `count` above LCODE_MAX_WORDS is reported on
 * stderr and truncated to LCODE_MAX_WORDS, so only that many entries of
 * `dst` are written.
 */
static void aw_fel_readl_n(struct xfel_ctx_t * ctx, uint32_t addr,
               uint32_t *dst, size_t count)
{
    if (count == 0) return;
    if (count > LCODE_MAX_WORDS) {
        fprintf(stderr,
            "ERROR: Max. word count exceeded, truncating aw_fel_readl_n() transfer\n");
        count = LCODE_MAX_WORDS;
    }

    assert(LCODE_MAX_WORDS < 256); /* protect against corruption of ARM code */
    uint32_t arm_code[] = {
        htole32(0xe59f0020), /* ldr  r0, [pc, #32] ; ldr r0,[read_addr]  */
        htole32(0xe28f1024), /* add  r1, pc, #36   ; adr r1, read_data   */
        htole32(0xe59f201c), /* ldr  r2, [pc, #28] ; ldr r2,[read_count] */
        htole32(0xe3520000 + LCODE_MAX_WORDS), /* cmp    r2, #LCODE_MAX_WORDS */
        htole32(0xc3a02000 + LCODE_MAX_WORDS), /* movgt    r2, #LCODE_MAX_WORDS */
        /* read_loop: */
        htole32(0xe2522001), /* subs r2, r2, #1    ; r2 -= 1             */
        htole32(0x412fff1e), /* bxmi lr            ; return if (r2 < 0)  */
        htole32(0xe4903004), /* ldr  r3, [r0], #4  ; load and post-inc   */
        htole32(0xe4813004), /* str  r3, [r1], #4  ; store and post-inc  */
        htole32(0xeafffffa), /* b    read_loop                           */
        htole32(addr),       /* read_addr */
        htole32(count)       /* read_count */
        /* read_data (buffer) follows, i.e. values go here */
    };
    assert(sizeof(arm_code) == LCODE_ARM_SIZE);

    /* scratch buffer setup: transfers ARM code, including addr and count */
    fel_write(ctx, 0x00008800, (void *)arm_code, sizeof(arm_code));
    /* execute code, read back the result */
    fel_exec(ctx, 0x00008800);

    /*
     * Fixed-size bounce buffer instead of a variable-length array: `count`
     * has already been clamped to LCODE_MAX_WORDS above, so the maximum size
     * is known at compile time and only the first `count` words are used.
     */
    uint32_t buffer[LCODE_MAX_WORDS];
    fel_read(ctx, 0x00008800 + LCODE_ARM_SIZE, buffer, count * sizeof(uint32_t));

    /* extract values to destination buffer */
    for (size_t i = 0; i < count; i++)
        dst[i] = le32toh(buffer[i]);
}

/*
 * Read `count` 32-bit words starting at target address `addr` into `dst`,
 * regardless of size. The request is split into successive calls to
 * aw_fel_readl_n(), each covering at most LCODE_MAX_WORDS words.
 */
void fel_readl_n(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t *dst, size_t count)
{
    size_t chunk;

    for (; count > 0; count -= chunk) {
        /* Take as much as is left, capped at one scratch buffer's worth. */
        chunk = count;
        if (chunk > LCODE_MAX_WORDS)
            chunk = LCODE_MAX_WORDS;

        aw_fel_readl_n(ctx, addr, dst, chunk);

        /* Advance both the target address and the host-side cursor. */
        addr += chunk * sizeof(uint32_t);
        dst += chunk;
    }
}
/*
 * Multiple "writel" from a source buffer to sequential addresses.
 *
 * Writes `count` 32-bit words from `src` to consecutive target addresses
 * starting at `addr`. A small ARM routine, its two parameters (address and
 * word count) and the data words (each passed through htole32()) are uploaded
 * to 0x00008800 as one image, which is then executed.
 *
 * A `count` of 0 is a no-op. A `count` above LCODE_MAX_WORDS is reported on
 * stderr and truncated to LCODE_MAX_WORDS.
 */
static void aw_fel_writel_n(struct xfel_ctx_t * ctx, uint32_t addr,
                uint32_t *src, size_t count)
{
    if (!count)
        return;

    if (count > LCODE_MAX_WORDS) {
        fprintf(stderr,
            "ERROR: Max. word count exceeded, truncating aw_fel_writel_n() transfer\n");
        count = LCODE_MAX_WORDS;
    }

    assert(LCODE_MAX_WORDS < 256); /* protect against corruption of ARM code */

    /*
     * The array is sized for the largest possible image (0x100 words) so that
     * it can be partially initialized with the code; the remaining elements
     * are filled with data below.
     */
    uint32_t image[LCODE_MAX_TOTAL] = {
        htole32(0xe59f0020), /* ldr  r0, [pc, #32] ; ldr r0,[write_addr] */
        htole32(0xe28f1024), /* add  r1, pc, #36   ; adr r1, write_data  */
        htole32(0xe59f201c), /* ldr  r2, [pc, #28] ; ldr r2,[write_count]*/
        htole32(0xe3520000 + LCODE_MAX_WORDS), /* cmp    r2, #LCODE_MAX_WORDS */
        htole32(0xc3a02000 + LCODE_MAX_WORDS), /* movgt    r2, #LCODE_MAX_WORDS */
        /* write_loop: */
        htole32(0xe2522001), /* subs r2, r2, #1    ; r2 -= 1             */
        htole32(0x412fff1e), /* bxmi lr            ; return if (r2 < 0)  */
        htole32(0xe4913004), /* ldr  r3, [r1], #4  ; load and post-inc   */
        htole32(0xe4803004), /* str  r3, [r0], #4  ; store and post-inc  */
        htole32(0xeafffffa), /* b    write_loop                          */
        htole32(addr),       /* write_addr */
        htole32(count)       /* write_count */
        /* write_data (buffer) follows, i.e. values taken from here */
    };

    /* Place the data words directly behind the code and its parameters. */
    uint32_t *data = &image[LCODE_ARM_WORDS];
    for (size_t i = 0; i < count; i++)
        data[i] = htole32(src[i]);

    /* Upload only the used part of the image (code + data), then run it. */
    const size_t image_bytes = (LCODE_ARM_WORDS + count) * sizeof(uint32_t);
    fel_write(ctx, 0x00008800, (void *)image, image_bytes);
    fel_exec(ctx, 0x00008800);
}


/*
 * Write `count` 32-bit words from `src` to consecutive target addresses
 * starting at `addr`, regardless of size. The request is split into
 * successive calls to aw_fel_writel_n(), each covering at most
 * LCODE_MAX_WORDS words.
 */
void fel_writel_n(struct xfel_ctx_t * ctx, uint32_t addr, uint32_t *src, size_t count)
{
    size_t chunk;

    for (; count > 0; count -= chunk) {
        /* Take as much as is left, capped at one scratch buffer's worth. */
        chunk = count;
        if (chunk > LCODE_MAX_WORDS)
            chunk = LCODE_MAX_WORDS;

        aw_fel_writel_n(ctx, addr, src, chunk);

        /* Advance both the target address and the host-side cursor. */
        addr += chunk * sizeof(uint32_t);
        src += chunk;
    }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants