Git Product home page Git Product logo

Comments (11)

jannic avatar jannic commented on September 15, 2024

I don't see a reason why there should be a difference between STM32 and RP2040 in this regard.
You need two things:

  • a linker script
  • some code which copies the code to RAM

For the linker script, instead of replacing link.x from cortex-m-rt, you can use a memory.x like this:

/* Memory regions available to the linker. */
MEMORY {
    /* First 0x100 bytes of flash; holds the .boot2 section placed below. */
    BOOT2 : ORIGIN = 0x10000000, LENGTH = 0x100
    /* Remainder of the 2048K flash, starting right after BOOT2. */
    FLASH : ORIGIN = 0x10000100, LENGTH = 2048K - 0x100
    RAM : ORIGIN = 0x20000000, LENGTH = 256K
    /* Two 4k regions directly after RAM, declared as separate regions. */
    SRAM4 : ORIGIN = 0x20040000, LENGTH = 4k
    SRAM5 : ORIGIN = 0x20041000, LENGTH = 4k
}

/* Enter BOOT2_FIRMWARE as an undefined symbol so the object defining it is pulled into the link. */
EXTERN(BOOT2_FIRMWARE)

SECTIONS {
    /* ### Boot loader */
    /* Placed at the very start of the BOOT2 region; KEEP prevents it from being garbage-collected. */
    .boot2 ORIGIN(BOOT2) :
    {
        KEEP(*(.boot2));
    } > BOOT2
} INSERT BEFORE .text;

SECTIONS {
    /* Collects all input sections named .sram4.
       Run address (VMA) is in SRAM4, load address (LMA) is in FLASH, so the
       contents have to be copied from flash to SRAM4 before they are used. */
    .sram4 :
    {
	KEEP(*(.sram4));
    } > SRAM4 AT>FLASH
} INSERT AFTER .text;

You'd need to add code to actually initialize the SRAM4 contents before jumping to functions located there, and that code probably needs some linker symbols so it can find the data to be copied in flash.

I don't have a ready-made example, and the details depend on what you are actually trying to achieve. But if you have working code for STM32, you can probably copy most of what you need from there.

from rp-hal.

thejpster avatar thejpster commented on September 15, 2024

You can also just put functions in .data, if you don't mind where in SRAM they go:

https://github.com/Neotron-Compute/Neotron-Pico-BIOS/blob/4bf443cd7fb35142b3f832a224ff4be93794314c/src/vga/mod.rs#L200

from rp-hal.

alexkazik avatar alexkazik commented on September 15, 2024

@jannic with your linker file the resulting elf still reports that region at address 0, which will not work:

Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  2 .text         00006e04  100001c0  100001c0  000002f4  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  3 .sram4and5    0000004c  00000000  00000000  000086e0  2**2
                  CONTENTS, READONLY

@thejpster I do care where it is, the idea is that the second core will only run within sram4+5 to improve performance.

from rp-hal.

thejpster avatar thejpster commented on September 15, 2024

You should benchmark carefully to ensure it actually does improve performance, as SRAM0-3 are striped so each successive word comes from a different bank.

jannic suggested a section and a segment called sram4 but your ELF contains a segment called sram4and5. Can we see the source code? If not (e.g. because this is a proprietary project) you may require professional support under NDA.

from rp-hal.

alexkazik avatar alexkazik commented on September 15, 2024

Sorry, I just put the two together:

/* Memory regions available to the linker. */
MEMORY {
    /* First 0x100 bytes of flash; holds the .boot2 section placed below. */
    BOOT2 : ORIGIN = 0x10000000, LENGTH = 0x100
    /* Remainder of the 2048K flash, starting right after BOOT2. */
    FLASH : ORIGIN = 0x10000100, LENGTH = 2048K - 0x100
    RAM : ORIGIN = 0x20000000, LENGTH = 256K
    /* Single 8k region directly after RAM (covers what could also be declared as two 4k regions). */
    SRAM4AND5 : ORIGIN = 0x20040000, LENGTH = 8k
}

/* Enter BOOT2_FIRMWARE as an undefined symbol so the object defining it is pulled into the link. */
EXTERN(BOOT2_FIRMWARE)

SECTIONS {
    /* ### Boot loader */
    /* Placed at the very start of the BOOT2 region; KEEP prevents it from being garbage-collected. */
    .boot2 ORIGIN(BOOT2) :
    {
        KEEP(*(.boot2));
    } > BOOT2
} INSERT BEFORE .text;

SECTIONS {
    /* Collects all input sections named .sram4and5.
       Run address (VMA) is in SRAM4AND5, load address (LMA) is in FLASH, so the
       contents have to be copied from flash to RAM before they are used. */
    .sram4and5 :
    {
	KEEP(*(.sram4and5));
    } > SRAM4AND5 AT>FLASH
} INSERT AFTER .text;

It's currently a very basic routine, just to test that everything works as expected:

.syntax unified
.cpu cortex-m0plus
.thumb

//bare metal assembly blinking routine
// basic: bare-metal LED blink routine, emitted into the .sram4and5 section.
//
// Steps: clear reset bit 5 through rst_clr, poll bit 5 at rst_base+8 until it
// is set, write function 5 to the GPIO25 control register, enable GPIO25 as an
// output through the SIO block, then toggle the pin forever with a busy-wait
// delay between transitions.
//
// Takes no arguments and never returns (led_loop branches back unconditionally).
// Uses r0-r3; lr is overwritten by the `bl delay` calls.
.global basic
// NOTE(review): no section flags are given here; the follow-up in this thread
// suggests `.section .sram4and5,"ax",%progbits` so the section is allocated.
.section .sram4and5
.p2align 2
.type basic,%function
basic:
.fnstart


//releases the peripheral reset for iobank_0
	ldr r0, =rst_clr	// atomic register for clearing reset controller (0x4000c000+0x3000)
	movs r1, #32      	// load a 1 into bit 5
	str r1, [r0, #0] 	// store the bitmask into the atomic register to clear register

// check if reset is done
rst:
    ldr r0, =rst_base	// base address for reset controller
	ldr r1, [r0, #8] 	// offset to get to the reset_done register
	movs r2, #32			// load 1 in bit 5 of register 2 (...0000000000100000)
	ands r1, r1, r2		// isolate bit 5
	beq rst				// if bit five is 0 then check again, if not, reset is done


// set the control
	ldr r0, =ctrl		// control register for GPIO25
	movs r1, #5			// Function 5, select SIO for GPIO25 2.19.2
	str r1, [r0]  		// Store function_5 in GPIO25 control register
//shifts over "1" the number of bits of GPIO pin
	movs r1, #1			// load a 1 into register 1
	lsls r1, r1, #25 	// move the bit over to align with GPIO25
	ldr r0, =sio_base	// SIO base
	str r1, [r0, #36]  	// 0x24 GPIO output enable

// From here on r0 = sio_base and r1 = GPIO25 bitmask stay constant.
led_loop:
	str r1, [r0, #20] 	// 0x14 GPIO output value set
	ldr r3, =big_num	// load countdown number
	bl delay 			// branch to subroutine delay

	str r1, [r0, #24]	// 0x18 GPIO output value clear
	ldr r3, =big_num	// load countdown number
	bl delay 			// branch to subroutine delay

	b led_loop			// do the loop again

// delay: busy-wait that counts r3 down to zero, then returns via lr.
// Expects the iteration count in r3; modifies r3 and the flags only.
delay:
	subs r3, #1			// subtract 1 from register 3
	bne delay			// loop back to delay if not zero
	bx lr				// return from subroutine

.fnend


.p2align 2
data:

.equ rst_clr, 0x4000f000 	// atomic register for clearing reset controller 2.1.2

.equ rst_base, 0x4000c000	// reset controller base 2.14.3

.equ ctrl, 0x400140cc 		// GPIO25_CTRL 2.19.6.1

.equ sio_base, 0xd0000000	// SIO base 2.3.1.7

.equ big_num, 0x00f00000 	// large number for the delay loop

build.rs:

use std::io::Error;

/// Build script entry point: assembles `c64/basic.S` through the `cc` crate.
///
/// Asks Cargo to re-run this script whenever the assembly source changes,
/// then builds the file with warnings enabled and promoted to errors and with
/// debug information turned on, producing the output named `c64`.
fn main() -> Result<(), Error> {
    // Single source of truth for the assembly file path.
    const ASM_SOURCE: &str = "c64/basic.S";

    println!("cargo:rerun-if-changed={}", ASM_SOURCE);

    let mut assembler = cc::Build::new();
    assembler
        .file(ASM_SOURCE)
        .warnings(true)
        .warnings_into_errors(true)
        .debug(true);
    assembler.compile("c64");

    Ok(())
}

from rp-hal.

jannic avatar jannic commented on September 15, 2024

.section .sram4and5

Please try .section .sram4and5,"ax",%progbits here.
I can't claim I fully understand why, but it seems to be important. I guess it's the "a" flag (SHF_ALLOC, "This section occupies memory during process execution.") that makes the difference.

References:
https://developer.arm.com/documentation/101754/0618/armclang-Reference/armclang-Integrated-Assembler/Section-directives
https://www.man7.org/linux/man-pages/man5/elf.5.html

from rp-hal.

alexkazik avatar alexkazik commented on September 15, 2024

That looks way better.

  3 .sram4and5    0000004c  20040000  10006fc4  00007118  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE

But it does not run; I probably have to copy the code? I'll continue this evening, but wanted to let you know now.

from rp-hal.

jannic avatar jannic commented on September 15, 2024

Yes, that's what I meant when I wrote "some code which copies the code to RAM".
You could for example use #[pre_init] to copy the code from flash to RAM. But make sure to read the warnings in https://docs.rs/cortex-m-rt/0.7.3/cortex_m_rt/attr.pre_init.html, there are some pitfalls. Perhaps consider writing it in assembly.

from rp-hal.

alexkazik avatar alexkazik commented on September 15, 2024

I got it to work with:

SECTIONS {
     . = ALIGN(4);
    /* Code/data that runs from SRAM4AND5 but is stored in FLASH.
       The symbols below give startup code what it needs to copy the section. */
    .sram4and5 :
    {
     . = ALIGN(4);
      /* Start of the section at its run address (in SRAM4AND5). */
      _srelocate = .;
	KEEP(*(.sram4and5));
     . = ALIGN(4);
      /* End of the section at its run address, 4-byte aligned. */
      _erelocate = .;
    } > SRAM4AND5 AT>FLASH
} INSERT AFTER .text;
/* Load address of the section, i.e. where its contents are stored in FLASH. */
_xrelocate = LOADADDR(.sram4and5);
    // Copy the `.sram4and5` section from its load address in flash to its run
    // address in RAM, so the code placed there can be executed afterwards.
    // SAFETY: NOTE(review) relies on the linker script defining the three
    // symbols below with `_srelocate <= _erelocate`, and on nothing else
    // accessing the destination range while it is written — confirm.
    unsafe {
        extern "C" {
            // Linker-provided symbols; only their addresses are meaningful.
            static mut _srelocate: u8; // start of the section at its run address
            static mut _erelocate: u8; // end of the section at its run address
            static mut _xrelocate: u8; // load address of the section
        }
        let srelocate: *mut u8 = addr_of_mut!(_srelocate);
        let erelocate: *const u8 = addr_of!(_erelocate);
        let xrelocate: *const u8 = addr_of!(_xrelocate);
        // Section length in bytes, derived from the start/end symbols.
        let size = erelocate.offset_from(srelocate) as usize;
        // Byte-wise copy: load address -> run address.
        from_raw_parts_mut(srelocate, size).copy_from_slice(from_raw_parts(xrelocate, size));
    }
    // Start `basic` on the second core. Its stack is 100 `usize` words whose
    // start address is 0x20040000 + 8 * 1024 - 4 * 100.
    // NOTE(review): assumes `usize` is 4 bytes, so the stack ends exactly at
    // 0x20040000 + 8k, and that the relocated code does not reach into this
    // range — confirm.
    let _test = core1.spawn(
        unsafe {
            &mut *(slice_from_raw_parts_mut((0x20040000 + 8 * 1024 - 4 * 100) as *mut usize, 100))
        },
        move || unsafe { basic() },
    );

Probably not the nicest, but it's working. The first core uses the "normal" way and the second uses only this.

By the way, the RP2040 datasheet says that if one of the SRAM banks 0-5 is accessed by more than one CPU (or another bus master), then they have to wait. Since SRAM4+5 is used exclusively by the second core, no waiting should happen.

Thanks for all your help!

from rp-hal.

jannic avatar jannic commented on September 15, 2024

Glad that it finally worked!
Regarding the possible wait states on concurrent access: I'd love to see some benchmark showing how big the difference is. I guess it's rather small, but actual measurements would be interesting.

from rp-hal.

alexkazik avatar alexkazik commented on September 15, 2024

I won't measure it; I'll just trust them that there are no wait states if the bank is not used elsewhere.

And as already said: Thanks!

from rp-hal.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.