You solved it one way but I'll still write my answer.
Very bare bare metal example...
strap.s
.globl _start
_start:
b reset
b hang
b swi_handler
b hang
reset:
msr cpsr_c, 0x13 /* Supervisor mode */
mov sp,#0x10000
msr cpsr_c, 0x10 /* User mode */
mov sp,#0x9000
bl notmain
hang:
b hang
swi_handler:
push {r0,r1,r2,r3,r4,lr}
pop {r0,r1,r2,r3,r4,lr}
movs pc,lr
.globl GETPC
GETPC:
mov r0,pc
bx lr
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.globl GET32
GET32:
ldr r0,[r0]
bx lr
notmain.c
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
unsigned int GETPC ( void );
#define UART_BASE 0x101F1000
#define UARTDR (UART_BASE+0x000)
static void uart_send ( unsigned int x )
{
PUT32(UARTDR,x);
}
static void hexstrings ( unsigned int d )
{
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_send(rc);
if(rb==0) break;
}
uart_send(0x20);
}
static void hexstring ( unsigned int d )
{
hexstrings(d);
uart_send(0x0D);
uart_send(0x0A);
}
int notmain ( void )
{
unsigned int ra;
hexstring(0x12345678);
hexstring(GETPC());
for(ra=0;ra<0x20;ra+=4)
{
hexstrings(ra);
hexstring(GET32(ra));
}
return(0);
}
memmap
MEMORY
{
ram : ORIGIN = 0x00010000, LENGTH = 32K
}
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.text*) } > ram
}
Build
arm-linux-gnueabi-as --warn --fatal-warnings -march=armv5t strap.s -o strap.o
arm-linux-gnueabi-gcc -c -Wall -O2 -nostdlib -nostartfiles -ffreestanding -march=armv5t notmain.c -o notmain.o
arm-linux-gnueabi-ld strap.o notmain.o -T memmap -o notmain.elf
arm-linux-gnueabi-objdump -D notmain.elf > notmain.list
arm-linux-gnueabi-objcopy notmain.elf -O binary notmain.bin
Execute
qemu-system-arm -M versatilepb -m 128M -nographic -kernel notmain.bin
Output
12345678
0001003C
00000000 E3A00000
00000004 E59F1004
00000008 E59F2004
0000000C E59FF004
00000010 00000183
00000014 00000100
00000018 00010000
0000001C 00000000
Examine, assemble disassemble
.word 0xE3A00000
.word 0xE59F1004
.word 0xE59F2004
.word 0xE59FF004
.word 0x00000183
.word 0x00000100
.word 0x00010000
.word 0x00000000
0: e3a00000 mov r0, #0
4: e59f1004 ldr r1, [pc, #4] ; 10 <.text+0x10>
8: e59f2004 ldr r2, [pc, #4] ; 14 <.text+0x14>
c: e59ff004 ldr pc, [pc, #4] ; 18 <.text+0x18>
10: 00000183 andeq r0, r0, r3, lsl #3
14: 00000100 andeq r0, r0, r0, lsl #2
18: 00010000 andeq r0, r1, r0
1c: 00000000 andeq r0, r0, r0
So you can see that they are basically launching a Linux kernel the ATAGS/dtb is in ram at 0x100 perhaps. And they jump to 0x10000. 0001003C being the pc shown by the program as loaded with that command line using the -O binary version was loaded at 0x10000 and executed there. If you were to have an swi event then you would execute starting with the ldr r2 instruction and land on the rest handler in your code.
(Note incidentally that qemu doesn't properly model uarts, at least so far as I have found so you don't have to initialize them you don't have to wait for the tx buffer to be empty you just jam bytes into the tx buffer and they come out).
If you run the elf without changing the linker script
qemu-system-arm -M versatilepb -m 128M -nographic -kernel notmain.elf
12345678
0001003C
00000000 00000000
00000004 00000000
00000008 00000000
0000000C 00000000
00000010 00000000
00000014 00000000
00000018 00000000
0000001C 00000000
Interesting it loads and runs at 0x10000 which is what it was linked for but doesn't bother to setup for coming out of reset at 0x00000000 and/or this is that linker issue that makes for bad elf files and it padded with zeros which is
1c: 00000000 andeq r0, r0, r0
So it could have executed from 0x00000000 to 0x10000 and run into our code.
If we change the linker script
ram : ORIGIN = 0x00000000, LENGTH = 32K
Run the elf not the bin
qemu-system-arm -M versatilepb -m 128M -nographic -kernel notmain.elf
12345678
0000003C
00000000 EA000002
00000004 EA000006
00000008 EA000006
0000000C EA000004
00000010 E321F013
00000014 E3A0D801
00000018 E321F010
0000001C E3A0DA09
as expected.
Now for the swi.
strap.s
.globl _start
_start:
b reset
b hang
b swi_handler
b hang
reset:
msr cpsr_c, 0x13 /* Supervisor mode */
mov sp,#0x10000
msr cpsr_c, 0x10 /* User mode */
mov sp,#0x9000
bl notmain
hang:
b hang
swi_handler:
push {r0,r1,r2,r3,r4,lr}
bl handler
pop {r0,r1,r2,r3,r4,lr}
movs pc,lr
.globl GETPC
GETPC:
mov r0,pc
bx lr
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.globl GET32
GET32:
ldr r0,[r0]
bx lr
.globl do_swi
do_swi:
svc #0x08
bx lr
notmain.c
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
unsigned int GETPC ( void );
void do_swi ( void );
#define UART_BASE 0x101F1000
#define UARTDR (UART_BASE+0x000)
static void uart_send ( unsigned int x )
{
PUT32(UARTDR,x);
}
static void hexstring ( unsigned int d )
{
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_send(rc);
if(rb==0) break;
}
uart_send(0x0D);
uart_send(0x0A);
}
void handler ( void )
{
hexstring(0x11223344);
}
int notmain ( void )
{
hexstring(0x12345678);
do_swi();
hexstring(0x12345678);
return(0);
}
memmap
MEMORY
{
ram : ORIGIN = 0x00000000, LENGTH = 32K
}
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.text*) } > ram
}
Run the elf, output is
12345678
11223344
12345678
as desired. But you could have also done this
strap.s
.globl _start
_start:
ldr pc,reset_addr
ldr pc,hang_addr
ldr pc,swi_handler_addr
ldr pc,hang_addr
reset_addr: .word reset
hang_addr: .word hang
swi_handler_addr: .word swi_handler
reset:
mov r0,#0x10000
mov r1,#0x00000
ldmia r0!,{r2,r3,r4,r5}
stmia r1!,{r2,r3,r4,r5}
ldmia r0!,{r2,r3,r4,r5}
stmia r1!,{r2,r3,r4,r5}
msr cpsr_c, 0x13 /* Supervisor mode */
mov sp,#0x10000
msr cpsr_c, 0x10 /* User mode */
mov sp,#0x9000
bl notmain
hang:
b hang
swi_handler:
push {r0,r1,r2,r3,r4,lr}
bl handler
pop {r0,r1,r2,r3,r4,lr}
movs pc,lr
.globl GETPC
GETPC:
mov r0,pc
bx lr
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.globl GET32
GET32:
ldr r0,[r0]
bx lr
.globl do_swi
do_swi:
svc #0x08
bx lr
notmain.c
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
unsigned int GETPC ( void );
void do_swi ( void );
#define UART_BASE 0x101F1000
#define UARTDR (UART_BASE+0x000)
static void uart_send ( unsigned int x )
{
PUT32(UARTDR,x);
}
static void hexstring ( unsigned int d )
{
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_send(rc);
if(rb==0) break;
}
uart_send(0x0D);
uart_send(0x0A);
}
void handler ( void )
{
hexstring(0x11223344);
}
int notmain ( void )
{
unsigned int ra;
hexstring(0x12345678);
for(ra=0x10000;ra<0x10020;ra+=4) hexstring(GET32(ra));
for(ra=0x00000;ra<0x00020;ra+=4) hexstring(GET32(ra));
do_swi();
hexstring(0x12345678);
return(0);
}
memmap
MEMORY
{
ram : ORIGIN = 0x00010000, LENGTH = 32K
}
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.text*) } > ram
}
And now both the elf and the binary image versions work. I let the toolchain do the work for me:
00010010 <reset_addr>:
10010: 0001001c
00010014 <hang_addr>:
10014: 00010048
00010018 <swi_handler_addr>:
10018: 0001004c
The ldr pc, is position independent. I copy the four entries plus the four (well three) addresses so that 0x00000 matches 0x10000 and now the exception table (it is not a vector table btw) works.
With newer arm processors you could instead set VTOR to 0x10000 and it would use the one built into the binary, no copying necessary. Or as you solved just build and run your program from 0x00000 and there you go. I wanted to show the alternatives as well as how to figure out (by cheating, you have to love uarts in qemu) what qemu is doing and where it is loading without having to use a debugger.