Запись во внешний медленный порт сильно портит все дело. Вот что получилось у меня (RVDS 2.2, полная оптимизация по скорости):
1. Без записи в порт: вместо регистра пишем в кэшируемую память.
Ваш вариант 334 FPS, мой - 397 FPS. Разница заметна.
2. С записью в порт.
Ваш вариант - 215 FPS, мой - 221 FPS. Практически одинаково.
CODE
; -----------------------------------------------------------------------------
; BlitScreen -- disassembly listing (columns: address, encoding, ASCII, mnemonic).
; 32-bit ARM-state code ($a is the ARM mapping symbol for ARM instructions).
;
; What the visible code does:
;   * Reads 32-bit words from two places, [r3] and [r3 + 0x100], where r3 starts
;     as the caller's r0. Presumably these are two adjacent 256-byte rows of an
;     8-bit indexed source image -- TODO confirm against the C source.
;   * Each source byte indexes a table of halfwords whose base address is loaded
;     from the literal pool (presumably the PALETTE[] array mentioned in the
;     surrounding post -- verify).
;   * For every pair of byte columns, the four table entries (2 bytes from the
;     first word, 2 bytes from the word at +0x100) are summed, three bit fields
;     of the sum are packed into one byte, and that byte is stored to
;     0x20000007 (r6 + 7). Presumably this is the slow external port the post
;     talks about -- confirm.
;
; Register roles (all established by the instructions below):
;   r0  = base of the halfword lookup table
;   r1  = current word at [r3];  r2 = current word at [r3 + 0x100]
;   r3  = read pointer
;   r4  = 0x1c mask, r5 = 0xe0 mask
;   r6  = 0x20000000 (store target base)
;   r7  = inner loop counter, 0..15
;   r10 = outer loop counter, 0..0x77
;   r8, r9, r12 = scratch
;
; Loop shape: the inner loop body is unrolled 4x (4 word pairs = 8 stored bytes,
; r3 += 16 per iteration); 16 inner iterations consume 0x100 bytes from each
; of the two read positions; the outer loop then adds another 0x100 to r3 and
; repeats 0x78 (120) times.
; -----------------------------------------------------------------------------
BlitScreen
$a
.text
; Prologue: save the callee-saved registers used below.
0x00000000: e92d07f0 ..-. PUSH {r4-r10}
; r3 = caller's r0 (read pointer); r0 is then reused for the table base.
0x00000004: e1a03000 .0.. MOV r3,r0
0x00000008: e59f03f4 .... LDR r0,[pc,#1012] ; [.bss$5 = 0x404] = 0
; Loop-invariant constants: outer counter, store base, and the two bit masks.
0x0000000c: e3a0a000 .... MOV r10,#0
0x00000010: e3a06202 .b.. MOV r6,#0x20000000
0x00000014: e3a050e0 .P.. MOV r5,#0xe0
0x00000018: e3a0401c .@.. MOV r4,#0x1c
; ---- outer loop head (branch target 0x1c): reset the inner counter ----
0x0000001c: e3a07000 .p.. MOV r7,#0
; ---- inner loop head (branch target 0x20) ----
; Unrolled group 1 of 4: load one word from each of the two read positions.
0x00000020: e5931000 .... LDR r1,[r3,#0]
0x00000024: e5932100 .!.. LDR r2,[r3,#0x100]
; First output byte of the group: uses byte 0 and byte 1 of r1 and of r2.
; (x & 0xff) LSL #1 and (x & 0xff00) LSR #7 both yield byte*2, i.e. a halfword
; offset into the table at r0.
0x00000028: e201c0ff .... AND r12,r1,#0xff
0x0000002c: e2018cff .... AND r8,r1,#0xff00
0x00000030: e08083a8 .... ADD r8,r0,r8,LSR #7
0x00000034: e080c08c .... ADD r12,r0,r12,LSL #1
0x00000038: e1dcc0b0 .... LDRH r12,[r12,#0]
0x0000003c: e1d880b0 .... LDRH r8,[r8,#0]
; r12 accumulates the sum of the four table entries.
0x00000040: e088c00c .... ADD r12,r8,r12
0x00000044: e20280ff .... AND r8,r2,#0xff
0x00000048: e0808088 .... ADD r8,r0,r8,LSL #1
0x0000004c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000050: e088c00c .... ADD r12,r8,r12
0x00000054: e2028cff .... AND r8,r2,#0xff00
0x00000058: e08083a8 .... ADD r8,r0,r8,LSR #7
0x0000005c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000060: e088c00c .... ADD r12,r8,r12
; Pack three fields of the sum into one byte:
;   sum bits 8:6   -> out bits 4:2  ((sum >> 4) & 0x1c)
;   sum bits 3:2   -> out bits 1:0  ((sum << 28) >> 30)
;   sum bits 13:11 -> out bits 7:5  ((sum >> 6) & 0xe0)
0x00000064: e004822c ,... AND r8,r4,r12,LSR #4
0x00000068: e1a09e0c .... LSL r9,r12,#28
0x0000006c: e1888f29 )... ORR r8,r8,r9,LSR #30
0x00000070: e005c32c ,... AND r12,r5,r12,LSR #6
0x00000074: e188c00c .... ORR r12,r8,r12
; Store the packed byte to 0x20000007.
0x00000078: e5c6c007 .... STRB r12,[r6,#7]
; Second output byte of the group: same computation on byte 2 and byte 3 of
; r1 and r2 ((x & 0xff0000) LSR #15 and (x LSR #24) LSL #1 give byte*2).
; r1 and r2 are consumed here and reused as the accumulator / scratch.
0x0000007c: e201c8ff .... AND r12,r1,#0xff0000
0x00000080: e1a01c21 !... LSR r1,r1,#24
0x00000084: e0801081 .... ADD r1,r0,r1,LSL #1
0x00000088: e080c7ac .... ADD r12,r0,r12,LSR #15
0x0000008c: e1dcc0b0 .... LDRH r12,[r12,#0]
0x00000090: e1d110b0 .... LDRH r1,[r1,#0]
0x00000094: e081100c .... ADD r1,r1,r12
0x00000098: e202c8ff .... AND r12,r2,#0xff0000
0x0000009c: e080c7ac .... ADD r12,r0,r12,LSR #15
0x000000a0: e1a02c22 ",.. LSR r2,r2,#24
0x000000a4: e1dcc0b0 .... LDRH r12,[r12,#0]
0x000000a8: e0802082 . .. ADD r2,r0,r2,LSL #1
0x000000ac: e1d220b0 . .. LDRH r2,[r2,#0]
0x000000b0: e08c1001 .... ADD r1,r12,r1
0x000000b4: e0821001 .... ADD r1,r2,r1
; Same three-field packing as above, then store to the same address.
0x000000b8: e0042221 !".. AND r2,r4,r1,LSR #4
0x000000bc: e1a0ce01 .... LSL r12,r1,#28
0x000000c0: e1822f2c ,/.. ORR r2,r2,r12,LSR #30
0x000000c4: e0051321 !... AND r1,r5,r1,LSR #6
0x000000c8: e1821001 .... ORR r1,r2,r1
0x000000cc: e5c61007 .... STRB r1,[r6,#7]
; Unrolled group 2 of 4: pre-indexed load with writeback advances r3 by 4,
; then the instruction sequence of group 1 repeats unchanged.
0x000000d0: e5b31004 .... LDR r1,[r3,#4]!
0x000000d4: e5932100 .!.. LDR r2,[r3,#0x100]
0x000000d8: e201c0ff .... AND r12,r1,#0xff
0x000000dc: e2018cff .... AND r8,r1,#0xff00
0x000000e0: e08083a8 .... ADD r8,r0,r8,LSR #7
0x000000e4: e080c08c .... ADD r12,r0,r12,LSL #1
0x000000e8: e1dcc0b0 .... LDRH r12,[r12,#0]
0x000000ec: e1d880b0 .... LDRH r8,[r8,#0]
0x000000f0: e088c00c .... ADD r12,r8,r12
0x000000f4: e20280ff .... AND r8,r2,#0xff
0x000000f8: e0808088 .... ADD r8,r0,r8,LSL #1
0x000000fc: e1d880b0 .... LDRH r8,[r8,#0]
0x00000100: e088c00c .... ADD r12,r8,r12
0x00000104: e2028cff .... AND r8,r2,#0xff00
0x00000108: e08083a8 .... ADD r8,r0,r8,LSR #7
0x0000010c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000110: e088c00c .... ADD r12,r8,r12
0x00000114: e004822c ,... AND r8,r4,r12,LSR #4
0x00000118: e1a09e0c .... LSL r9,r12,#28
0x0000011c: e1888f29 )... ORR r8,r8,r9,LSR #30
0x00000120: e005c32c ,... AND r12,r5,r12,LSR #6
0x00000124: e188c00c .... ORR r12,r8,r12
0x00000128: e5c6c007 .... STRB r12,[r6,#7]
0x0000012c: e201c8ff .... AND r12,r1,#0xff0000
0x00000130: e1a01c21 !... LSR r1,r1,#24
0x00000134: e0801081 .... ADD r1,r0,r1,LSL #1
0x00000138: e080c7ac .... ADD r12,r0,r12,LSR #15
0x0000013c: e1dcc0b0 .... LDRH r12,[r12,#0]
0x00000140: e1d110b0 .... LDRH r1,[r1,#0]
0x00000144: e081100c .... ADD r1,r1,r12
0x00000148: e202c8ff .... AND r12,r2,#0xff0000
0x0000014c: e080c7ac .... ADD r12,r0,r12,LSR #15
0x00000150: e1a02c22 ",.. LSR r2,r2,#24
0x00000154: e1dcc0b0 .... LDRH r12,[r12,#0]
0x00000158: e0802082 . .. ADD r2,r0,r2,LSL #1
0x0000015c: e1d220b0 . .. LDRH r2,[r2,#0]
0x00000160: e08c1001 .... ADD r1,r12,r1
0x00000164: e0821001 .... ADD r1,r2,r1
0x00000168: e0042221 !".. AND r2,r4,r1,LSR #4
0x0000016c: e1a0ce01 .... LSL r12,r1,#28
0x00000170: e1822f2c ,/.. ORR r2,r2,r12,LSR #30
0x00000174: e0051321 !... AND r1,r5,r1,LSR #6
0x00000178: e1821001 .... ORR r1,r2,r1
0x0000017c: e5c61007 .... STRB r1,[r6,#7]
; Unrolled group 3 of 4: r3 += 4 via writeback, same sequence again.
0x00000180: e5b31004 .... LDR r1,[r3,#4]!
0x00000184: e5932100 .!.. LDR r2,[r3,#0x100]
0x00000188: e201c0ff .... AND r12,r1,#0xff
0x0000018c: e2018cff .... AND r8,r1,#0xff00
0x00000190: e08083a8 .... ADD r8,r0,r8,LSR #7
0x00000194: e080c08c .... ADD r12,r0,r12,LSL #1
0x00000198: e1dcc0b0 .... LDRH r12,[r12,#0]
0x0000019c: e1d880b0 .... LDRH r8,[r8,#0]
0x000001a0: e088c00c .... ADD r12,r8,r12
0x000001a4: e20280ff .... AND r8,r2,#0xff
0x000001a8: e0808088 .... ADD r8,r0,r8,LSL #1
0x000001ac: e1d880b0 .... LDRH r8,[r8,#0]
0x000001b0: e088c00c .... ADD r12,r8,r12
0x000001b4: e2028cff .... AND r8,r2,#0xff00
0x000001b8: e08083a8 .... ADD r8,r0,r8,LSR #7
0x000001bc: e1d880b0 .... LDRH r8,[r8,#0]
0x000001c0: e088c00c .... ADD r12,r8,r12
0x000001c4: e004822c ,... AND r8,r4,r12,LSR #4
0x000001c8: e1a09e0c .... LSL r9,r12,#28
0x000001cc: e1888f29 )... ORR r8,r8,r9,LSR #30
0x000001d0: e005c32c ,... AND r12,r5,r12,LSR #6
0x000001d4: e188c00c .... ORR r12,r8,r12
0x000001d8: e5c6c007 .... STRB r12,[r6,#7]
0x000001dc: e201c8ff .... AND r12,r1,#0xff0000
0x000001e0: e1a01c21 !... LSR r1,r1,#24
0x000001e4: e0801081 .... ADD r1,r0,r1,LSL #1
0x000001e8: e080c7ac .... ADD r12,r0,r12,LSR #15
0x000001ec: e1dcc0b0 .... LDRH r12,[r12,#0]
0x000001f0: e1d110b0 .... LDRH r1,[r1,#0]
0x000001f4: e081100c .... ADD r1,r1,r12
0x000001f8: e202c8ff .... AND r12,r2,#0xff0000
0x000001fc: e080c7ac .... ADD r12,r0,r12,LSR #15
0x00000200: e1a02c22 ",.. LSR r2,r2,#24
0x00000204: e1dcc0b0 .... LDRH r12,[r12,#0]
0x00000208: e0802082 . .. ADD r2,r0,r2,LSL #1
0x0000020c: e1d220b0 . .. LDRH r2,[r2,#0]
0x00000210: e08c1001 .... ADD r1,r12,r1
0x00000214: e0821001 .... ADD r1,r2,r1
0x00000218: e0042221 !".. AND r2,r4,r1,LSR #4
0x0000021c: e1a0ce01 .... LSL r12,r1,#28
0x00000220: e1822f2c ,/.. ORR r2,r2,r12,LSR #30
0x00000224: e0051321 !... AND r1,r5,r1,LSR #6
0x00000228: e1821001 .... ORR r1,r2,r1
0x0000022c: e5c61007 .... STRB r1,[r6,#7]
; Unrolled group 4 of 4: r3 += 4 via writeback, same sequence again.
0x00000230: e5b31004 .... LDR r1,[r3,#4]!
0x00000234: e5932100 .!.. LDR r2,[r3,#0x100]
0x00000238: e201c0ff .... AND r12,r1,#0xff
0x0000023c: e2018cff .... AND r8,r1,#0xff00
0x00000240: e08083a8 .... ADD r8,r0,r8,LSR #7
0x00000244: e080c08c .... ADD r12,r0,r12,LSL #1
0x00000248: e1dcc0b0 .... LDRH r12,[r12,#0]
0x0000024c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000250: e088c00c .... ADD r12,r8,r12
0x00000254: e20280ff .... AND r8,r2,#0xff
0x00000258: e0808088 .... ADD r8,r0,r8,LSL #1
0x0000025c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000260: e088c00c .... ADD r12,r8,r12
0x00000264: e2028cff .... AND r8,r2,#0xff00
0x00000268: e08083a8 .... ADD r8,r0,r8,LSR #7
0x0000026c: e1d880b0 .... LDRH r8,[r8,#0]
0x00000270: e088c00c .... ADD r12,r8,r12
0x00000274: e004822c ,... AND r8,r4,r12,LSR #4
0x00000278: e1a09e0c .... LSL r9,r12,#28
0x0000027c: e1888f29 )... ORR r8,r8,r9,LSR #30
0x00000280: e005c32c ,... AND r12,r5,r12,LSR #6
0x00000284: e188c00c .... ORR r12,r8,r12
0x00000288: e5c6c007 .... STRB r12,[r6,#7]
0x0000028c: e201c8ff .... AND r12,r1,#0xff0000
0x00000290: e1a01c21 !... LSR r1,r1,#24
0x00000294: e0801081 .... ADD r1,r0,r1,LSL #1
0x00000298: e080c7ac .... ADD r12,r0,r12,LSR #15
0x0000029c: e1dcc0b0 .... LDRH r12,[r12,#0]
0x000002a0: e1d110b0 .... LDRH r1,[r1,#0]
0x000002a4: e081100c .... ADD r1,r1,r12
0x000002a8: e202c8ff .... AND r12,r2,#0xff0000
0x000002ac: e080c7ac .... ADD r12,r0,r12,LSR #15
0x000002b0: e1a02c22 ",.. LSR r2,r2,#24
0x000002b4: e1dcc0b0 .... LDRH r12,[r12,#0]
0x000002b8: e0802082 . .. ADD r2,r0,r2,LSL #1
0x000002bc: e1d220b0 . .. LDRH r2,[r2,#0]
0x000002c0: e08c1001 .... ADD r1,r12,r1
0x000002c4: e0821001 .... ADD r1,r2,r1
0x000002c8: e0042221 !".. AND r2,r4,r1,LSR #4
0x000002cc: e1a0ce01 .... LSL r12,r1,#28
0x000002d0: e1822f2c ,/.. ORR r2,r2,r12,LSR #30
0x000002d4: e0051321 !... AND r1,r5,r1,LSR #6
0x000002d8: e1821001 .... ORR r1,r2,r1
0x000002dc: e5c61007 .... STRB r1,[r6,#7]
; Inner loop tail: count iterations and step r3 past the 4th word.
; The ADD sits between CMP and BCC but does not set flags, so the branch
; still tests r7 < 16 (unsigned).
0x000002e0: e2877001 .p.. ADD r7,r7,#1
0x000002e4: e3570010 ..W. CMP r7,#0x10
0x000002e8: e2833004 .0.. ADD r3,r3,#4
0x000002ec: 3affff4b K..: BCC {pc} - 0x2cc ; 0x20
; Outer loop tail: after 16 inner iterations r3 has advanced 0x100 bytes;
; add another 0x100 so the region already read through the +0x100 offset is
; skipped. Repeat while r10 < 0x78 (unsigned).
0x000002f0: e28aa001 .... ADD r10,r10,#1
0x000002f4: e35a0078 x.Z. CMP r10,#0x78
0x000002f8: e2833c01 .<.. ADD r3,r3,#0x100
0x000002fc: 3affff46 F..: BCC {pc} - 0x2e0 ; 0x1c
; Epilogue: restore the saved registers and return to the caller.
0x00000300: e8bd07f0 .... POP {r4-r10}
0x00000304: e12fff1e ../. BX r14
Цитата(axa09 @ Feb 24 2009, 13:45)

Замерял производительность с двумя вариантами размещения стека: во внутренней SRAM и во внешней - все то же самое.
Пробовал расположить PALETTE[] во внутренней SRAM - тоже ничего не поменялось.
Стек практически не используется (только один PUSH/POP {r4-r10} на вызов функции), а PALETTE полностью ложится в кэш, так что разницы и не должно быть.