In my previous project with rust on a STM32 I managed to place assembly code in ram. T

<a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="/us

Sorry, I just put the two together:

.section .sram4and5 Please try <code clas

That looks way better.

I did get it to work with:

Using sram4/5 about rp-hal HOT 11 OPEN

alexkazik commented on September 15, 2024

Using sram4/5

from rp-hal.

Comments (11)

jannic commented on September 15, 2024

I don't see a reason why there should be a difference between STM32 and RP2040 in this regard.
You need two things:

a linker script
some code which copies the code to RAM

For the linker script, instead of replacing link.x from cortex-m-rt, you can use a memory.x like this:

/*
 * memory.x for an RP2040 with 2 MiB of flash.
 *
 * BOOT2 is the first 0x100 bytes of flash and FLASH is the remainder.
 * RAM is 256K starting at 0x20000000; SRAM4 and SRAM5 are the two 4k
 * regions that follow it directly (0x20040000 and 0x20041000).
 */
MEMORY {
    BOOT2 : ORIGIN = 0x10000000, LENGTH = 0x100
    FLASH : ORIGIN = 0x10000100, LENGTH = 2048K - 0x100
    RAM : ORIGIN = 0x20000000, LENGTH = 256K
    SRAM4 : ORIGIN = 0x20040000, LENGTH = 4k
    SRAM5 : ORIGIN = 0x20041000, LENGTH = 4k
}

/* Treat BOOT2_FIRMWARE as an undefined symbol so the object that
   defines it is pulled into the link. */
EXTERN(BOOT2_FIRMWARE)

SECTIONS {
    /* ### Boot loader */
    /* Placed at the very start of the BOOT2 region; KEEP prevents the
       section from being garbage-collected. */
    .boot2 ORIGIN(BOOT2) :
    {
        KEEP(*(.boot2));
    } > BOOT2
} INSERT BEFORE .text;

SECTIONS {
    /* Everything in input sections named .sram4 runs from SRAM4 (VMA)
       but is stored in FLASH (LMA). The linker only assigns the
       addresses: code elsewhere must copy the contents from flash to
       SRAM4 before anything located here is used. */
    .sram4 :
    {
	KEEP(*(.sram4));
    } > SRAM4 AT>FLASH
} INSERT AFTER .text;

You'd need to add code to actually initialize RAM4 contents before jumping to functions located there, and that code probably needs some linker symbols so it can find the data to be copied in flash.

I don't have a ready-made example, and the details depend on what you are actually trying to achieve. But if you have working code for STM32, you can probably copy most of what you need from there.

from rp-hal.

thejpster commented on September 15, 2024

You can also just put functions in .data, if you don't mind where in SRAM they go:

https://github.com/Neotron-Compute/Neotron-Pico-BIOS/blob/4bf443cd7fb35142b3f832a224ff4be93794314c/src/vga/mod.rs#L200

from rp-hal.

alexkazik commented on September 15, 2024

@jannic with your linker file the resulting elf still reports that region at address 0, which will not work:

Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  2 .text         00006e04  100001c0  100001c0  000002f4  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  3 .sram4and5    0000004c  00000000  00000000  000086e0  2**2
                  CONTENTS, READONLY

@thejpster I do care where it is, the idea is that the second core will only run within sram4+5 to improve performance.

from rp-hal.

thejpster commented on September 15, 2024

You should benchmark carefully to ensure it actually does improve performance, as SRAM0-3 are striped so each successive word comes from a different bank.

jannic suggested a section and a memory region called sram4, but your ELF contains a section called sram4and5. Can we see the source code? If not (e.g. because this is a proprietary project) you may require professional support under NDA.

from rp-hal.

alexkazik commented on September 15, 2024

sorry, I did just put the two together:

/*
 * memory.x for an RP2040 with 2 MiB of flash.
 *
 * BOOT2 is the first 0x100 bytes of flash and FLASH is the remainder.
 * RAM is 256K starting at 0x20000000; SRAM4AND5 is a single 8k region
 * that follows it directly at 0x20040000.
 */
MEMORY {
    BOOT2 : ORIGIN = 0x10000000, LENGTH = 0x100
    FLASH : ORIGIN = 0x10000100, LENGTH = 2048K - 0x100
    RAM : ORIGIN = 0x20000000, LENGTH = 256K
    SRAM4AND5 : ORIGIN = 0x20040000, LENGTH = 8k
}

/* Treat BOOT2_FIRMWARE as an undefined symbol so the object that
   defines it is pulled into the link. */
EXTERN(BOOT2_FIRMWARE)

SECTIONS {
    /* ### Boot loader */
    /* Placed at the very start of the BOOT2 region; KEEP prevents the
       section from being garbage-collected. */
    .boot2 ORIGIN(BOOT2) :
    {
        KEEP(*(.boot2));
    } > BOOT2
} INSERT BEFORE .text;

SECTIONS {
    /* Everything in input sections named .sram4and5 runs from
       SRAM4AND5 (VMA) but is stored in FLASH (LMA). The linker only
       assigns the addresses: code elsewhere must copy the contents
       from flash to RAM before anything located here is used. */
    .sram4and5 :
    {
	KEEP(*(.sram4and5));
    } > SRAM4AND5 AT>FLASH
} INSERT AFTER .text;

It's currently a very basic routine, just to test that everything works as expected:

.syntax unified
.cpu cortex-m0plus
.thumb

// Bare-metal assembly blinking routine.
//
// `basic` takes IO_BANK0 out of reset, waits until the reset controller
// reports the reset as done, routes GPIO25 to SIO, enables it as an output
// and then toggles it forever with a busy-wait delay in between.
// It never returns (it ends in an unconditional loop) and uses r0-r3 and lr.
.global basic
// NOTE(review): this directive carries no section flags. Declaring it as
// `.section .sram4and5,"ax",%progbits` marks the section as allocated and
// executable, so the linker can give it a real address.
.section .sram4and5
.p2align 2
.type basic,%function
basic:
.fnstart


// Release the peripheral reset for iobank_0.
	ldr r0, =rst_clr	// atomic register for clearing reset controller (0x4000c000+0x3000)
	movs r1, #32      	// load a 1 into bit 5
	str r1, [r0, #0] 	// store the bitmask into the atomic register to clear register

// Poll until the reset is done.
rst:
    ldr r0, =rst_base	// base address for reset controller
	ldr r1, [r0, #8] 	// offset to get to the reset_done register
	movs r2, #32			// load 1 in bit 5 of register 2 (...0000000000100000)
	ands r1, r1, r2		// isolate bit 5
	beq rst				// if bit five is 0 then check again, if not, reset is done


// Select the pin function for GPIO25.
	ldr r0, =ctrl		// control register for GPIO25
	movs r1, #5			// Function 5, select SIO for GPIO25 2.19.2
	str r1, [r0]  		// Store function_5 in GPIO25 control register
// Build the single-bit mask for GPIO25 and enable the pin as an output.
	movs r1, #1			// load a 1 into register 1
	lsls r1, r1, #25 	// move the bit over to align with GPIO25
	ldr r0, =sio_base	// SIO base
	str r1, [r0, #36]  	// 0x24 GPIO output enable

// Blink forever: r0 keeps the SIO base and r1 the GPIO25 mask across iterations.
led_loop:
	str r1, [r0, #20] 	// 0x14 GPIO output value set
	ldr r3, =big_num	// load countdown number
	bl delay 			// branch to subroutine delay

	str r1, [r0, #24]	// 0x18 GPIO output value clear
	ldr r3, =big_num	// load countdown number
	bl delay 			// branch to subroutine delay

	b led_loop			// do the loop again

// Busy-wait subroutine: counts r3 down to zero, then returns via lr.
// Input: r3 = number of iterations. Modifies r3 and the condition flags.
delay:
	subs r3, #1			// subtract 1 from register 3
	bne delay			// loop back to delay if not zero
	bx lr				// return from subroutine

.fnend


.p2align 2
// The `.equ` lines below only define assembly-time constants; they emit no
// bytes themselves. The `ldr rX, =name` pseudo-instructions above load them.
// NOTE(review): presumably via a literal pool the assembler places in this
// section - confirm with a disassembly.
data:

.equ rst_clr, 0x4000f000 	// atomic register for clearing reset controller 2.1.2

.equ rst_base, 0x4000c000	// reset controller base 2.14.3

.equ ctrl, 0x400140cc 		// GPIO25_CTRL 2.19.6.1

.equ sio_base, 0xd0000000	// SIO base 2.3.1.7

.equ big_num, 0x00f00000 	// large number for the delay loop

build.rs:

use std::io::Error;

/// Build script entry point.
///
/// Tells cargo to re-run the script whenever `c64/basic.S` changes, then
/// hands that assembly file to the `cc` crate and compiles it under the
/// library name `c64`. Warnings are enabled and promoted to errors, and
/// debug information is requested.
fn main() -> Result<(), Error> {
    // Re-run trigger: only the assembly source matters for this script.
    println!("cargo:rerun-if-changed=c64/basic.S");

    // Configure the build step by step, then run it.
    let mut assembler = cc::Build::new();
    assembler.file("c64/basic.S");
    assembler.warnings(true);
    assembler.warnings_into_errors(true);
    assembler.debug(true);
    assembler.compile("c64");

    Ok(())
}

from rp-hal.

jannic commented on September 15, 2024

.section .sram4and5

Please try .section .sram4and5,"ax",%progbits here.
I can't claim I fully understand why, but it seems to be important. I guess it's the a flag (SHF_ALLOC, "This section occupies memory during process execution.") that makes a difference.

References:
https://developer.arm.com/documentation/101754/0618/armclang-Reference/armclang-Integrated-Assembler/Section-directives
https://www.man7.org/linux/man-pages/man5/elf.5.html

from rp-hal.

alexkazik commented on September 15, 2024

That looks way better.

  3 .sram4and5    0000004c  20040000  10006fc4  00007118  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE

But it does not run; I probably have to copy the code to RAM myself? I'll continue this evening, but wanted to let you know now.

from rp-hal.

jannic commented on September 15, 2024

Yes, that's what I meant when I wrote "some code which copies the code to RAM".
You could for example use #[pre_init] to copy the code from flash to RAM. But make sure to read the warnings in https://docs.rs/cortex-m-rt/0.7.3/cortex_m_rt/attr.pre_init.html, there are some pitfalls. Perhaps consider writing it in assembly.

from rp-hal.

alexkazik commented on September 15, 2024

I did get it to work with:

/*
 * Output section for code that runs from the SRAM4AND5 region but is
 * stored in FLASH. It exports three symbols for the copy routine:
 *   _srelocate  start of the section at its run address (VMA)
 *   _erelocate  end of the section at its run address (VMA)
 *   _xrelocate  load address (LMA) of the section, i.e. the copy source
 * Start and end are both aligned to 4 bytes.
 */
SECTIONS {
     . = ALIGN(4);
    .sram4and5 :
    {
     . = ALIGN(4);
      _srelocate = .;
	KEEP(*(.sram4and5));
     . = ALIGN(4);
      _erelocate = .;
    } > SRAM4AND5 AT>FLASH
} INSERT AFTER .text;
/* Where the section's bytes were placed in FLASH. */
_xrelocate = LOADADDR(.sram4and5);

    // Copy the `.sram4and5` section from its load address in flash to its
    // run address in RAM. This happens before core1 is started below, so the
    // code is in place when `basic` begins executing.
    //
    // SAFETY (assumed, confirm): the three symbols come from the linker
    // script, where `_srelocate`/`_erelocate` bracket the section at its run
    // address and `_xrelocate` is `LOADADDR(.sram4and5)`; nothing else is
    // expected to reference the destination range while it is overwritten.
    unsafe {
        extern "C" {
            // Only the addresses of these symbols are used, never their values.
            static mut _srelocate: u8;
            static mut _erelocate: u8;
            static mut _xrelocate: u8;
        }
        let srelocate: *mut u8 = addr_of_mut!(_srelocate);
        let erelocate: *const u8 = addr_of!(_erelocate);
        let xrelocate: *const u8 = addr_of!(_xrelocate);
        // Section length in bytes (end minus start at the run address).
        let size = erelocate.offset_from(srelocate) as usize;
        from_raw_parts_mut(srelocate, size).copy_from_slice(from_raw_parts(xrelocate, size));
    }
    // The slice passed to `spawn` is 100 `usize` words ending at
    // 0x20040000 + 8 KiB; presumably it is core1's stack (confirm against the
    // `spawn` signature). The `4 * 100` assumes a 4-byte `usize`.
    // NOTE(review): the linker script does not reserve this area, so it would
    // overlap the relocated code if `.sram4and5` grew into the last 400 bytes.
    let _test = core1.spawn(
        unsafe {
            &mut *(slice_from_raw_parts_mut((0x20040000 + 8 * 1024 - 4 * 100) as *mut usize, 100))
        },
        move || unsafe { basic() },
    );

It's probably not the nicest solution, but it's working. The first core uses the "normal" way and the second core uses only this.

By the way, the RP2040 datasheet says that if one of the RAM banks 0-5 is accessed by more than one CPU (or other bus master) at the same time, then they have to wait. Since SRAM4+5 is used exclusively by the second core, no waiting should happen.

Thanks for all your help!

from rp-hal.

jannic commented on September 15, 2024

Glad that it finally worked!
Regarding the possible wait states on concurrent access: I'd love to see some benchmark showing how big the difference is. I guess it's rather small, but actual measurements would be interesting.

from rp-hal.

alexkazik commented on September 15, 2024

I won't measure it; I'll just trust the datasheet that there are no wait states if the RAM is not used elsewhere.

And as already said: Thanks!

from rp-hal.

Using sram4/5 about rp-hal HOT 11 OPEN

Comments (11)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent