Write is twice the speed of read? | |
| [Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] | |
Folks, I am trying to optimize large block memcpy speed on my Versatile A/B board (210 MHz ARM926EJ-S with VFP, Linux 2.6.9). So I wrote a few timing loops to determine the speed of moving memory. For my requirement, I need to copy a block of memory that's 2x the size of the dcache.
So I wrote two loops to time the speed of reading or writing (but not reading *and* writing at the same time). With a 32KB dcache, I try to read or write 4KB or 64KB of data using load/store multiple instructions. The strange results are that large blocks of writes are about 2x the speed of reads. The speed of memcpy is measured using the standard GNU C library.
rd_ldmia(4) read 4KB of data (ldmia) 134217728 bytes in 220864 us
607.694 MByte/sec
rd_ldmia(64) read 64KB of data (ldmia) 134217728 bytes in 1030162 us
130.287 MByte/sec
wd_stmia(4) write 4KB of data (stmia) 134217728 bytes in 544041 us
246.705 MByte/sec
wd_stmia(64) write 64KB of data (stmia) 134217728 bytes in 539586 us
248.742 MByte/sec
memcpy(4) memcpy 4KB of data 134217728 bytes in 575432 us
233.246 MByte/sec
memcpy(64) memcpy 64KB of data 134217728 bytes in 1510152 us
88.876 MByte/sec
Does anyone know why the reads are just half the speed of the writes?
Maybe I am not doing the right thing to use the full bandwidth between
main_mem -> dcache? I am hoping that if I can get the reads to be as
fast as the writes, memcpy can achieve around 120MB/sec.
--------------------------------------------------------------------------------------------------
@void rd_ldmia(int *addr, int num_bytes, int num_loops);
@
@ Read-only timing loop: walks num_bytes bytes starting at addr using
@ load-multiple instructions, and repeats that walk num_loops times.
@ The loaded data is discarded.
@
@ In:    r0 = addr, r1 = num_bytes, r2 = num_loops
@ Uses:  r3 = bytes left in the current pass
@        r8 = copy of addr, used to rewind r0 before each further pass
@        r4-r7 = load destinations
@ Saves: r4-r8 are pushed on entry and popped on exit.
@ Clobb: r0, r2, r3, flags
@
@ Both loops exit on "bne", i.e. only when the counter reaches exactly
@ zero, so num_bytes must be a non-zero multiple of 64 and num_loops
@ must be non-zero.
        .global rd_ldmia
rd_ldmia:
        stmfd   sp!, {r4-r8}            @ stmfd sp! is the same instruction as stmdb sp!
        mov     r8, r0                  @ remember the buffer start
.Lrd_next_pass:
        mov     r3, r1                  @ r3 = bytes to read in this pass
.Lrd_next_chunk:
        ldmia   r0!, {r4-r7}            @ 4 words = 16 bytes per ldmia,
        ldmia   r0!, {r4-r7}            @ so 4 x 16 = 64 bytes per trip
        ldmia   r0!, {r4-r7}
        ldmia   r0!, {r4-r7}
        subs    r3, r3, #64
        bne     .Lrd_next_chunk

        subs    r2, r2, #1              @ one pass finished
        movne   r0, r8                  @ more to do: rewind (mov leaves flags alone)
        bne     .Lrd_next_pass

        ldmfd   sp!, {r4-r8}            @ ldmfd sp! is the same instruction as ldmia sp!
        mov     pc, lr
@void wd_stmia(int *addr, int num_bytes, int num_loops);
@
@ Write-only timing loop: fills num_bytes bytes starting at addr using
@ store-multiple instructions, and repeats that fill num_loops times.
@ r4-r7 are never written by this routine, so the data stored is
@ whatever those registers held on entry.
@
@ In:    r0 = addr, r1 = num_bytes, r2 = num_loops
@ Uses:  r3 = bytes left in the current pass
@        r8 = copy of addr, used to rewind r0 before each further pass
@ Saves: r4-r8 are pushed on entry and popped on exit.
@ Clobb: r0, r2, r3, flags
@
@ Both loops exit on "bne", i.e. only when the counter reaches exactly
@ zero, so num_bytes must be a non-zero multiple of 64 and num_loops
@ must be non-zero.
        .global wd_stmia
wd_stmia:
        stmdb   sp!, {r4, r5, r6, r7, r8}
        mov     r8, r0                  @ remember the buffer start
wd_stmia_0:
        mov     r3, r1                  @ r3 = bytes to write in this pass
wd_stmia_1:
        stmia   r0!, {r4, r5, r6, r7}   @ 4 words = 16 bytes per stmia,
        stmia   r0!, {r4, r5, r6, r7}   @ so 4 x 16 = 64 bytes per trip
        stmia   r0!, {r4, r5, r6, r7}
        stmia   r0!, {r4, r5, r6, r7}
        subs    r3, r3, #64
        bne     wd_stmia_1

        subs    r2, r2, #1              @ one pass finished
        movne   r0, r8                  @ more to do: rewind (mov leaves flags alone)
        bne     wd_stmia_0

        @ Restore with a load. A store here would fix up sp but leave r8
        @ holding addr, corrupting a register the caller expects preserved.
        ldmia   sp!, {r4, r5, r6, r7, r8}
        mov     pc, lr
-------------------------------------------------------------------
List admin: http://lists.arm.linux.org.uk/mailman/listinfo/linux-arm
FAQ: http://www.arm.linux.org.uk/mailinglists/faq.php
Etiquette: http://www.arm.linux.org.uk/mailinglists/etiquette.php
[Site Home] [IETF Annouce] [Security] [Bugtraq] [Linux] [Linux ARM Kernel] [Linux MIPS] [ECOS] [Tools] [DDR & Rambus] [Monitors]