Вот lds:
CODE
OUTPUT_FORMAT("elf32-littlearm", "elf32-littlearm", "elf32-littlearm")
OUTPUT_ARCH(arm)
ENTRY(entry)
MEMORY
{
sram (W!RX) : ORIGIN = 0x300000, LENGTH = 64K
ddr_ebi0 (W!RX) : ORIGIN = 0x73f00000, LENGTH = 64M
}
SECTIONS
{
.fixed :
{
. = ALIGN(4);
_sfixed = .;
*(.text*)
*(.rodata*)
*(.glue_7)
*(.glue_7t)
*(.data)
*(.CP15_*)
. = ALIGN(4);
_efixed = .;
} >ddr_ebi0
.prerelocate : AT (_efixed)
{
. = ALIGN(4);
_sprerelocate = .;
. = ALIGN(4);
_eprerelocate = .;
}
.postrelocate : AT (_efixed + SIZEOF(.prerelocate))
{
. = ALIGN(4);
_spostrelocate = .;
*(.vectors);
*(.ramfunc)
. = ALIGN(4);
_epostrelocate = .;
} >sram
.bss (NOLOAD) : {
_szero = .;
*(.bss)
. = ALIGN(4);
_ezero = .;
} >ddr_ebi0
_sstack = 0x73f00000 + 64 * 1024 * 1024;
}
end = .;
А вот стартап:
CODE
#include "board.h"
//------------------------------------------------------------------------------
// Definitions
//------------------------------------------------------------------------------
#define IRQ_STACK_SIZE 8*3*4
#define ARM_MODE_ABT 0x17
#define ARM_MODE_FIQ 0x11
#define ARM_MODE_IRQ 0x12
#define ARM_MODE_SVC 0x13
#define I_BIT 0x80
#define F_BIT 0x40
//------------------------------------------------------------------------------
// Startup routine
//------------------------------------------------------------------------------
.align 4
.arm
/* Exception vectors
*******************/
.section .vectors, "a", %progbits
resetVector:
ldr pc, =resetHandler /* Reset */
undefVector:
b undefVector /* Undefined instruction */
swiVector:
b swiVector /* Software interrupt */
prefetchAbortVector:
b prefetchAbortVector /* Prefetch abort */
dataAbortVector:
b dataAbortVector /* Data abort */
reservedVector:
b reservedVector /* Reserved for future use */
irqVector:
b irqHandler /* Interrupt */
fiqVector:
/* Fast interrupt */
//------------------------------------------------------------------------------
/// Handles a fast interrupt request by branching to the address defined in the
/// AIC.
//------------------------------------------------------------------------------
fiqHandler:
b fiqHandler
//------------------------------------------------------------------------------
/// Handles incoming interrupt requests by branching to the corresponding
/// handler, as defined in the AIC. Supports interrupt nesting.
//------------------------------------------------------------------------------
irqHandler:
/* Save interrupt context on the stack to allow nesting */
sub lr, lr, #4
stmfd sp!, {lr}
mrs lr, SPSR
stmfd sp!, {r0, lr}
/* Write in the IVR to support Protect Mode */
ldr lr, =AT91C_BASE_AIC
ldr r0, [lr, #AIC_IVR]
str lr, [lr, #AIC_IVR]
/* Branch to interrupt handler in Supervisor mode */
msr CPSR_c, #ARM_MODE_SVC
stmfd sp!, {r1-r3, r4, r12, lr}
blx r0
/* Restore scratch/used registers and LR from User Stack */
/* Disable Interrupt and switch back in IRQ mode */
ldmia sp!, {r1-r3, r4, r12, lr}
msr CPSR_c, #ARM_MODE_IRQ | I_BIT
/* Acknowledge interrupt */
ldr lr, =AT91C_BASE_AIC
str lr, [lr, #AIC_EOICR]
/* Restore interrupt context and branch back to calling code */
ldmia sp!, {r0, lr}
msr SPSR_cxsf, lr
ldmia sp!, {pc}^
//------------------------------------------------------------------------------
/// Initializes the chip and branches to the main() function.
//------------------------------------------------------------------------------
.section .text
.global entry
entry:
resetHandler:
/* Useless instruction for referencing the .vectors section */
ldr r0, =resetVector
/* Set pc to actual code location (i.e. not in remap zone) */
ldr pc, =1f
/* Initialize the prerelocate segment */
1:
ldr r0, =_efixed
ldr r1, =_sprerelocate
ldr r2, =_eprerelocate
1:
cmp r1, r2
ldrcc r3, [r0], #4
strcc r3, [r1], #4
bcc 1b
/* Perform low-level initialization of the chip using LowLevelInit() */
ldr sp, =_sstack
stmfd sp!, {r0}
ldr r0, =LowLevelInit
blx r0
/* Initialize the postrelocate segment */
ldmfd sp!, {r0}
ldr r1, =_spostrelocate
ldr r2, =_epostrelocate
1:
cmp r1, r2
ldrcc r3, [r0], #4
strcc r3, [r1], #4
bcc 1b
/* Clear the zero segment */
ldr r0, =_szero
ldr r1, =_ezero
mov r2, #0
1:
cmp r0, r1
strcc r2, [r0], #4
bcc 1b
/* Setup stacks
**************/
/* IRQ mode */
msr CPSR_c, #ARM_MODE_IRQ | I_BIT | F_BIT
ldr sp, =_sstack
sub r4, sp, #IRQ_STACK_SIZE
/* Supervisor mode (interrupts enabled) */
msr CPSR_c, #ARM_MODE_SVC | F_BIT
mov sp, r4
/* Branch to main()
******************/
ldr r0, =main
blx r0
/* Loop indefinitely when program is finished */
1:
b 1b
Почему если я запускаю только айкеш производительность возрастает по сравнению с тем когда запускаю ММУ датакеш и айкеш в 1,5 раза
вот на всякий случай мой код включения мму дата и айкешей
CODE
static unsigned char *BufMMU = (unsigned char *) (SRAM_ADDRESS + 0x4000);
#include "cp15/cp15.h" //Atmel at91lib
void InitMMU(unsigned int *pTranslationTable)
{
int i;
int addSRAM;
// Program the TTB
printf("TTB = 0x%X\n\r", (unsigned int)pTranslationTable);
CP15_WriteTTB((unsigned int)pTranslationTable);
// Program the domain access register
CP15_WriteDomainAccessControl(0xFFFFFFFF); // domain 15: access are not checked
// Reset table entries
for (i = 0; i < 4096; i++) {
//printf("0x%X %d \r\n", (i << 20 | (3 << 10) | 0x12), (i << 20 | (3 << 10) | 0x12));
pTranslationTable[i] = i << 20 | (3 << 10) | 0x12;
}
// Program level 1 page table entry
// Vector adress
pTranslationTable[0x0] = (0x00000000)|TLB_NCNB;
// SRAM adress (with D cache)
addSRAM = (SRAM_ADDRESS >> 20);
printf("addSRAM = 0x%X\n\r", addSRAM);
printf("SRAM_ADDRESS = 0x%X\n\r", SRAM_ADDRESS);
pTranslationTable[addSRAM] = (SRAM_ADDRESS)|TLB_WB;
//DDRAM buf1 with write-through
pTranslationTable[0x701] = (AT91C_DDR2 + 0x00100000)|TLB_WT;
//DDRAM buf2 with write-through
pTranslationTable[0x702] = (AT91C_DDR2 + 0x00200000)|TLB_WT;
// Peripherals adress
pTranslationTable[0xFFF] = (0xFFF00000)|TLB_NCNB;
CP15_EnableMMU();
CP15_EnableDcache();
CP15_EnableIcache();
}
void main(void)
{
InitMMU((unsigned int*)BufMMU);
unsigned int mask = (1<<20);
AT91S_PIOM *pioa;
pioa = PIOA_BASE;
pio_enable(pioa, mask);
pio_disable_irq(pioa, mask);
pio_disable_multiple_driver(pioa, mask);
pio_disable_pull_ups(pioa, mask);
pio_output_enable(pioa, mask);
volatile unsigned int i;
while(1)
{
i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++
;i++;
PIO_ClearM(pioa, mask);
i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++;i++
;i++;
PIO_SetM(pioa, mask);
}
}
pTranslationTable[i] = i << 20 | (3 << 10) | 0x12;
Эту строку взял из убута до этого было
pTranslationTable[i] = 0
И при этом не работали мму и дкеш
но производительность при включении мму и дкеша сразу уменьшается чем это можно объяснить?