dwelch67
Posts: 955
Joined: Sat May 26, 2012 5:32 pm

Re: Run all 4 cores Raspberry Pi 3

Fri Jul 21, 2017 9:07 pm

the smc probably hits here

Code: Select all

   8:   e3001131    movw   r1, #305   ; 0x131
   c:   ee011f11    mcr   15, 0, r1, cr1, cr1, {0}
  10:   e30001da    movw   r0, #474   ; 0x1da
  14:   e16ff000    msr   SPSR_fsxc, r0
  18:   e1b0f00e    movs   pc, lr
which changes to HYP mode and returns. As for the SCTLR cache bits being enabled but not actually set when we read them later, I'm not sure about that... I'm not clearing them.

Code: Select all

SCTLR
enable caches  
  28:   ee110f10    mrc   15, 0, r0, cr1, cr0, {0}
  2c:   e3800004    orr   r0, r0, #4
  30:   e3800a01    orr   r0, r0, #4096   ; 0x1000
  34:   ee010f10    mcr   15, 0, r0, cr1, cr0, {0}
You could do some math based on the lower bits of the MPIDR to generate the stack pointer for each core without any if-then-elses.

AND it with 3, shift left by 12 or something like that, and add something like another 0x1000, giving 0x1000, 0x2000, 0x3000, 0x4000.

LdB
Posts: 1207
Joined: Wed Dec 07, 2016 2:29 pm

Re: Run all 4 cores Raspberry Pi 3

Fri Jul 21, 2017 9:11 pm

I made generic code to set the stack pointer and set up a core. This is the assembler:

Code: Select all

;@ ================================================================
;@ setStackPtr:    word the boot core fills with the stack top for the
;@                 core being started. ExtraCoreSetup writes 0 back to
;@                 it when it has finished; the boot core polls for that.
;@ ExtraCoreSetup: one-time setup for a secondary core (ARM state).
;@   In:      lr          = address to return to
;@            setStackPtr = stack pointer value for this core
;@   Out:     setStackPtr = 0
;@   Clobber: r0, r1, r12, flags (sp and CPU mode change by design)
;@ ================================================================
.globl setStackPtr; 
setStackPtr : .4byte 0;
.globl ExtraCoreSetup;
ExtraCoreSetup:
    mov r12, lr							;@ Keep the return address in r12: the HYP drop below overwrites lr and SVC_MODE uses its own banked lr
    ldr r1, =setStackPtr
    ldr r0, [r1]
    mov sp, r0							;@ Set the stack pointer for the mode we entered in
;@"================================================================"
;@ If the cpu is in HYP_MODE(EL2) we will bring it SVC_MODE (EL1).
;@"================================================================"
    mrs r0,cpsr						;@ Fetch the cpsr register which includes CPU mode bits 
 	and r1, r0, #0x1F						;@ Mask off the CPU mode bits to register r1                            
 	cmp r1, #0x1A						;@ check we are in HYP_MODE AKA register reads 1A											
	bne .NotInHypMode1					;@ Branch if not equal meaning was not in HYP_MODE  
	bic r0,r0,#0x1F						;@ Clear the CPU mode bits in register r0							
   	orr r0,r0,#0x13						;@ Logical OR SVC_MODE bits onto register
    msr spsr_cxsf,r0						;@ Hold value in spsr_cxsf
    add lr,pc,#4						;@ pc reads as this instruction + 8, so lr = address of .NotInHypMode1 (two words further on)
	/* I borrowed this trick from Ultibo because ARM6 code running on an ARM7/8 needs this opcode  */
	/* The ARM6 compiler does not know these instructions so it is a way to get needed opcode here */
    	/* So our ARM6 code can drop an arm7 or arm8 out of HYP mode and run on an ARM7/8.             */
 	/* Native ARM7/8 compilers already understand the OPCODE but do not mind it this way either	   */        
	.long 0xE12EF30E						;@ "msr ELR_hyp, lr" Set the address to ELR_hyp
	.long 0xE160006E						;@ "eret" Elevated return which will exit at .NotInHypMode1 in SVC_MODE
.NotInHypMode1:
    ldr r1, =setStackPtr					;@ fetch stack ptr value user set
    ldr r0, [r1]
	mov sp, r0								;@ Set the stack pointer again: sp is banked, so SVC_MODE needs its own copy
	
;@"================================================================"
;@ PI NSACR register setup for access to floating point unit
;@ Cortex A-7 => Section 4.3.34. Non-Secure Access Control Register
;@ Cortex A-53 => Section 4.5.32. Non-Secure Access Control Register
;@"================================================================"
	mrc p15, 0, r0, c1, c1, 2				;@ Read NSACR into R0
	and r1, r0, #0x00000C00					;@ Look only at the cp10/cp11 access bits; other NSACR bits must not force a write
	cmp r1, #0x00000C00						;@ Both already set => skip the write (NOTE(review): a non-secure write to NSACR is expected to fault - confirm)
	beq .free_to_enable_fpu1
	orr r0, r0, #0x3<<10					;@ Set access to both secure and non secure modes
	mcr p15, 0, r0, c1, c1, 2				;@ Write NSACR
;@"================================================================"
;@ Bring fpu online
;@"================================================================"
.free_to_enable_fpu1:
	mrc p15, 0, r0, c1, c0, #2				;@ R0 = Access Control Register
	orr r0, #(0x300000 + 0xC00000)			;@ Enable Single & Double Precision
	mcr p15,0,r0,c1,c0, #2					;@ Access Control Register = R0
	mov r0, #0x40000000						;@ R0 = Enable VFP
	vmsr fpexc, r0							;@ FPEXC = R0
;@"================================================================"
;@ Enable L1 cache
;@"================================================================"
    mrc p15,0,r0,c1,c0,0					;@ R0 = System Control Register

    /* Enable caches and branch prediction (SCTLR_ENABLE_* are defined outside this snippet) */
    orr r0, #SCTLR_ENABLE_BRANCH_PREDICTION
    orr r0, #SCTLR_ENABLE_DATA_CACHE
    orr r0, #SCTLR_ENABLE_INSTRUCTION_CACHE

    mcr p15,0,r0,c1,c0,0					;@ System Control Register = R0

    ldr r1, =setStackPtr
	mov r0, #0
	str r0, [r1]      /* This zeros setStackPtr ... AKA setup completed */ 
	bx  r12									;@ Return through the address saved at entry
This is the C code:

Code: Select all

/* Shared with the assembler routine: main writes a stack top here and
 * ExtraCoreSetup writes 0 back once it has finished.  It is modified by
 * another core, so it must be volatile; without it the compiler is free to
 * read it once and turn each wait below into an endless loop. */
extern volatile uint32_t setStackPtr;
extern void ExtraCoreSetup (void);
/* Mailbox words main writes an entry address to in order to start core 1, 2, 3. */
#define CORE1_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x4000009C))
#define CORE2_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x400000AC))
#define CORE3_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x400000BC))

/* Start cores 1..3 one at a time: publish the stack top, post the entry
 * address to that core's mailbox, then wait for the core to acknowledge by
 * zeroing setStackPtr before reusing the variable for the next core. */
int main (void){
	setStackPtr = 0x4000;
	*CORE1_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);
	setStackPtr = 0x5000;
	*CORE2_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);
	setStackPtr = 0x6000;
	*CORE3_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);

/* I reach here so each core confirms it is setup by clearing the setStackPtr value */
/*  it will hang on the while loops if the cores don't execute */
}
So it is working for me on the Pi3: each core is taking a stack pointer and doing the HYP drop. I am not sure if I am allowed to connect the FPU to each core, but I do :-)

I am worried about the link register when it does the HYP drop; I might save it to a memory location. Anyhow, I definitely have execution, it is just a matter of smoothing it out now.

Now to do it on AARCH64 mode.

banspri
Posts: 24
Joined: Tue Jun 27, 2017 11:02 pm

Re: Run all 4 cores Raspberry Pi 3

Fri Jul 21, 2017 11:15 pm

I got the cores running!!!!!!!!!!!!!!!!!!! Thanks so much guys.

LdB
Posts: 1207
Joined: Wed Dec 07, 2016 2:29 pm

Re: Run all 4 cores Raspberry Pi 3

Sat Jul 22, 2017 1:37 am

There is actually a bug with my code that you must have worked out :-)

A C compiler doesn't save R0-R3. We either need to put the core back in the stub's loop with those registers preserved, OR you need to make your own secondary spin loop that reads its mailbox and is C tolerant.

The first option would be totally dependent on the STUB loader (if that changed it would stop working), so I am going to quickly code the second option and make my own C-tolerant mailbox-read secondary spin, which is actually tolerant on registers r0-r3.

One completely C-safe assembler code block: the setup allows setting the svc_stack as well as the irq_stack on each core, and puts us back in the new C-tolerant spin loop.

Code: Select all

;@"================================================================"
;@ Same spin as the STUB does but tolerant on registers R0-R3 for C.
;@"================================================================"
@ SecondarySpin: park the calling core until a non-zero word appears in its
@ mailbox slot, then branch to that word as an address.
@   Polled address: 0x400000CC + (core number * 0x10)
@   At the branch:  r0 = 0, r1 = machid word, r2 = atags word,
@                   lr = SecondarySpin (so a "bx lr" in the target parks the core again)
@   Scratch:        r3 (zero compare value), r4 (mailbox value), r5 (mailbox base)
SecondarySpin:
	mrc     p15, 0, r0, c0, c0, 5		@ read CP15 c0,c0,5 (MPIDR)
	ubfx    r0, r0, #0, #2					// r0 = low two bits of that value = core number 0..3 (an index, not a bit mask)
	ldr r5, =mbox		
	ldr r5, [r5]		@ mbox
	mov	r3, #0			@ magic
	add	r5, #(0x400000CC-0x4000008C)	@ mbox
1:
	ldr	r4, [r5, r0, lsl #4]	@ r4 = word at base + core * 16
	cmp	r4, r3
	beq	1b			@ keep polling while the slot reads 0
@ clear mailbox
	str	r4, [r5, r0, lsl #4]	@ write the value just read back to the same slot (presumably write-1-to-clear; confirm against the BCM2836 mailbox docs)
	mov	r0, #0
	ldr r1, =machid		
	ldr r1, [r1]		@ BCM2708 machine id
	ldr r2, = atags		
	ldr r2, [r2]		@ ATAGS
	ldr lr, =SecondarySpin	@ return address for the routine being started
	bx	r4			@ jump to the address taken from the mailbox
	b SecondarySpin		@ not reached through fall-through (bx does not return here); kept as a guard
mbox: 	.4byte 0x4000008C
machid:	.4byte 3138
atags:  .4byte 0x100

;@ ================================================================
;@ setStackPtr / setIrqStackPtr: words the boot core fills with the
;@   SVC and IRQ stack tops for the core being started. ExtraCoreSetup
;@   writes 0 to both when it has finished; the boot core polls for that.
;@ ExtraCoreSetup: one-time setup for a secondary core (ARM state).
;@   In:      setStackPtr, setIrqStackPtr as above
;@   Out:     both words = 0; control continues at SecondarySpin
;@   Clobber: r0, r1, lr, flags (sp and CPU mode change by design)
;@ ================================================================
.globl setStackPtr; 
setStackPtr : .4byte 0;
.globl setIrqStackPtr; 
setIrqStackPtr : .4byte 0;
.globl ExtraCoreSetup;
ExtraCoreSetup:
    ldr r1, =setStackPtr				;@ Address of User stack pointer value
	ldr sp, [r1]						;@ Set the stack pointer for the mode we entered in
;@"================================================================"
;@ If the cpu is in HYP_MODE(EL2) we will bring it SVC_MODE (EL1).
;@"================================================================"
    mrs r0,cpsr							;@ Fetch the cpsr register which includes CPU mode bits 
 	and r1, r0, #0x1F					;@ Mask off the CPU mode bits to register r1                            
 	cmp r1, #0x1A						;@ check we are in HYP_MODE AKA register reads 1A											
	bne .NotInHypMode1					;@ Branch if not equal meaning was not in HYP_MODE  
	bic r0,r0,#0x1F						;@ Clear the CPU mode bits in register r0							
   	orr r0,r0,#0x13						;@ Logical OR SVC_MODE bits onto register
    msr spsr_cxsf,r0					;@ Hold value in spsr_cxsf
    add lr,pc,#4						;@ pc reads as this instruction + 8, so lr = address of .NotInHypMode1 (two words further on)
	/* I borrowed this trick from Ultibo because ARM6 code running on an ARM7/8 needs this opcode  */
	/* The ARM6 compiler does not know these instructions so it is a way to get needed opcode here */
    /* So our ARM6 code can drop an arm7 or arm8 out of HYP mode and run on an ARM7/8.             */
 	/* Native ARM7/8 compilers already understand the OPCODE but do not mind it this way either	   */        
	.long 0xE12EF30E					;@ "msr ELR_hyp, lr" Set the address to ELR_hyp
	.long 0xE160006E					;@ "eret" Elevated return which will exit at .NotInHypMode1 in SVC_MODE
.NotInHypMode1:
;@"================================================================"
;@ Now setup stack pointers for the different CPU operation modes.
;@"================================================================"
	msr CPSR_c, #0xD2					;@ Switch to IRQ_MODE
    ldr r1, =setIrqStackPtr
	ldr sp, [r1]						;@ Set the stack pointer for that mode
	msr CPSR_c, #0xD3					;@ Switch back to SVC_MODE
    ldr r1, =setStackPtr
	ldr sp, [r1]						;@ Set the stack pointer for that mode
;@"================================================================"
;@ PI NSACR register setup for access to floating point unit
;@ Cortex A-7 => Section 4.3.34. Non-Secure Access Control Register
;@ Cortex A-53 => Section 4.5.32. Non-Secure Access Control Register
;@"================================================================"
	mrc p15, 0, r0, c1, c1, 2			;@ Read NSACR into R0
	and r1, r0, #0x00000C00				;@ Look only at the cp10/cp11 access bits; other NSACR bits must not force a write
	cmp r1, #0x00000C00					;@ Both already set => skip the write (NOTE(review): a non-secure write to NSACR is expected to fault - confirm)
	beq .free_to_enable_fpu1
	orr r0, r0, #0x3<<10				;@ Set access to both secure and non secure modes
	mcr p15, 0, r0, c1, c1, 2			;@ Write NSACR
;@"================================================================"
;@ Bring fpu online
;@"================================================================"
.free_to_enable_fpu1:
	mrc p15, 0, r0, c1, c0, #2			;@ R0 = Access Control Register
	orr r0, #(0x300000 + 0xC00000)		;@ Enable Single & Double Precision
	mcr p15,0,r0,c1,c0, #2				;@ Access Control Register = R0
	mov r0, #0x40000000					;@ R0 = Enable VFP
	vmsr fpexc, r0						;@ FPEXC = R0
;@"================================================================"
;@ Enable L1 cache
;@"================================================================"
    mrc p15,0,r0,c1,c0,0				;@ R0 = System Control Register

    /* Enable caches and branch prediction (SCTLR_ENABLE_* are defined outside this snippet) */
    orr r0, #SCTLR_ENABLE_BRANCH_PREDICTION
    orr r0, #SCTLR_ENABLE_DATA_CACHE
    orr r0, #SCTLR_ENABLE_INSTRUCTION_CACHE

    mcr p15,0,r0,c1,c0,0				;@ System Control Register = R0
;@"================================================================"
;@ Acknowledge stacks set by clearing values
;@ (setStackPtr is cleared last: it is the word the boot core polls)
;@"================================================================"
	mov r0, #0
	ldr r1, =setIrqStackPtr
	str r0, [r1]
    ldr r1, =setStackPtr
	str r0, [r1]
	ldr lr, =SecondarySpin
	bx lr								;@ Always return to secondary spin	
	bl SecondarySpin					;@ Just safety .. to be sure to be sure (not reached: the bx above never falls through)
I started playing with cores in an IRQ function, so I needed to set the IRQ stack.
Setting a value in setStackPtr is mandatory; setIrqStackPtr is optional. Here I show setting both, but as I will use the cores in the same IRQ call on core 0, I set them all to the same place, 0x7000.

Code: Select all

/* Shared with the assembler routine: main writes the SVC and IRQ stack tops
 * here and ExtraCoreSetup zeroes both once the woken core has taken them. */
extern volatile uint32_t setStackPtr;
extern volatile uint32_t setIrqStackPtr;
/* Entry point posted to the mailboxes; it was used below without a
 * declaration, which does not compile. */
extern void ExtraCoreSetup (void);
/* Mailbox words main writes an entry address to in order to start core 1, 2, 3. */
#define CORE1_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x4000009C))
#define CORE2_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x400000AC))
#define CORE3_MAILBOX ((volatile __attribute__((aligned(4))) uint32_t*) (0x400000BC))

/* Start cores 1..3 one at a time: publish both stack tops, post the entry
 * address to that core's mailbox, then wait until the core zeroes
 * setStackPtr before the variables are reused for the next core. */
int main (void) {
	setStackPtr = 0x4000;
	setIrqStackPtr = 0x7000;         
	*CORE1_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);
	setStackPtr = 0x5000;
	setIrqStackPtr = 0x7000;
	*CORE2_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);
	setStackPtr = 0x6000;
	setIrqStackPtr = 0x7000;
	*CORE3_MAILBOX = (uintptr_t)&ExtraCoreSetup;
	while (setStackPtr != 0);
}

LdB
Posts: 1207
Joined: Wed Dec 07, 2016 2:29 pm

Re: Run all 4 cores Raspberry Pi 3

Sun Jul 23, 2017 2:35 pm

A GitHub sample is up if anyone was having trouble putting all the bits together .. nice team effort guys.
https://github.com/LdB-ECM/Raspberry-Pi ... /Multicore
Made it C safe, all 4 core stacks are setup in linker file and all 4 cores are setup by startup assembler. So all setup ready to go by the time it enters the C code.

banspri
Posts: 24
Joined: Tue Jun 27, 2017 11:02 pm

Re: Run all 4 cores Raspberry Pi 3

Mon Jul 24, 2017 3:36 pm

Is there any way to ensure that two cores aren't attempting to access, say, a semaphore at the same time?

dwelch67
Posts: 955
Joined: Sat May 26, 2012 5:32 pm

Re: Run all 4 cores Raspberry Pi 3

Mon Jul 24, 2017 5:21 pm

that is what ldrex/strex are for assuming broadcom implemented their side properly.

User avatar
Ultibo
Posts: 158
Joined: Wed Sep 30, 2015 10:29 am
Location: Australia
Contact: Website

Re: Run all 4 cores Raspberry Pi 3

Mon Jul 24, 2017 11:51 pm

banspri wrote:Is there any way to ensure that two cores aren't attempting to access, say, a semaphore at the same time?
You might be asking the wrong question really, semaphores, spinlocks and mutexes etc are what you use to protect code and data from being accessed by multiple cores in cases where it would cause issues.

There is no reason to prevent multiple cores from accessing a semaphore because a properly written implementation is designed to handle that case. The better question might be how do you write a semaphore or mutex so that it works correctly with multiple cores and for that question ARM provide a document called ARM Synchronization Primitives that gives a very good introduction.
Ultibo.org | Make something amazing
https://ultibo.org

Threads, multi-core, OpenGL, Camera, FAT, NTFS, TCP/IP, USB and more in 3MB with 2 second boot!

Return to “Bare metal, Assembly language”