path: root/arch/arm/lib
Diffstat (limited to 'arch/arm/lib')
-rw-r--r--  arch/arm/lib/Makefile         |   5
-rw-r--r--  arch/arm/lib/asm-offsets.c    |  88
-rw-r--r--  arch/arm/lib/asmdefs.h        |  98
-rw-r--r--  arch/arm/lib/bootm.c          |  45
-rw-r--r--  arch/arm/lib/ccn504.S         |   3
-rw-r--r--  arch/arm/lib/crt0.S           |  11
-rw-r--r--  arch/arm/lib/div64.S          |  10
-rw-r--r--  arch/arm/lib/lib1funcs.S      |   6
-rw-r--r--  arch/arm/lib/memcpy-arm64.S   | 242
-rw-r--r--  arch/arm/lib/memset-arm64.S   | 148
-rw-r--r--  arch/arm/lib/relocate.S       |  35
-rw-r--r--  arch/arm/lib/stack.c          |  14
12 files changed, 553 insertions, 152 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 7f663327151..c48e1f622d3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o
obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o
obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o
endif
+ifdef CONFIG_ARM64
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o
+else
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+endif
obj-$(CONFIG_SEMIHOSTING) += semihosting.o
obj-y += bdinfo.o
diff --git a/arch/arm/lib/asm-offsets.c b/arch/arm/lib/asm-offsets.c
index 1a306ec4153..22fd541f9a2 100644
--- a/arch/arm/lib/asm-offsets.c
+++ b/arch/arm/lib/asm-offsets.c
@@ -15,7 +15,7 @@
#include <linux/kbuild.h>
#include <linux/arm-smccc.h>
-#if defined(CONFIG_MX25) || defined(CONFIG_MX27) || defined(CONFIG_MX35) \
+#if defined(CONFIG_MX27) \
|| defined(CONFIG_MX51) || defined(CONFIG_MX53)
#include <asm/arch/imx-regs.h>
#endif
@@ -35,42 +35,6 @@ int main(void)
* code. Is it better to define the macros directly in headers?
*/
-#if defined(CONFIG_MX25)
- /* Clock Control Module */
- DEFINE(CCM_CCTL, offsetof(struct ccm_regs, cctl));
- DEFINE(CCM_CGCR0, offsetof(struct ccm_regs, cgr0));
- DEFINE(CCM_CGCR1, offsetof(struct ccm_regs, cgr1));
- DEFINE(CCM_CGCR2, offsetof(struct ccm_regs, cgr2));
- DEFINE(CCM_PCDR2, offsetof(struct ccm_regs, pcdr[2]));
- DEFINE(CCM_MCR, offsetof(struct ccm_regs, mcr));
-
- /* Enhanced SDRAM Controller */
- DEFINE(ESDRAMC_ESDCTL0, offsetof(struct esdramc_regs, ctl0));
- DEFINE(ESDRAMC_ESDCFG0, offsetof(struct esdramc_regs, cfg0));
- DEFINE(ESDRAMC_ESDMISC, offsetof(struct esdramc_regs, misc));
-
- /* Multi-Layer AHB Crossbar Switch */
- DEFINE(MAX_MPR0, offsetof(struct max_regs, mpr0));
- DEFINE(MAX_SGPCR0, offsetof(struct max_regs, sgpcr0));
- DEFINE(MAX_MPR1, offsetof(struct max_regs, mpr1));
- DEFINE(MAX_SGPCR1, offsetof(struct max_regs, sgpcr1));
- DEFINE(MAX_MPR2, offsetof(struct max_regs, mpr2));
- DEFINE(MAX_SGPCR2, offsetof(struct max_regs, sgpcr2));
- DEFINE(MAX_MPR3, offsetof(struct max_regs, mpr3));
- DEFINE(MAX_SGPCR3, offsetof(struct max_regs, sgpcr3));
- DEFINE(MAX_MPR4, offsetof(struct max_regs, mpr4));
- DEFINE(MAX_SGPCR4, offsetof(struct max_regs, sgpcr4));
- DEFINE(MAX_MGPCR0, offsetof(struct max_regs, mgpcr0));
- DEFINE(MAX_MGPCR1, offsetof(struct max_regs, mgpcr1));
- DEFINE(MAX_MGPCR2, offsetof(struct max_regs, mgpcr2));
- DEFINE(MAX_MGPCR3, offsetof(struct max_regs, mgpcr3));
- DEFINE(MAX_MGPCR4, offsetof(struct max_regs, mgpcr4));
-
- /* AHB <-> IP-Bus Interface */
- DEFINE(AIPS_MPR_0_7, offsetof(struct aips_regs, mpr_0_7));
- DEFINE(AIPS_MPR_8_15, offsetof(struct aips_regs, mpr_8_15));
-#endif
-
#if defined(CONFIG_MX27)
DEFINE(AIPI1_PSR0, IMX_AIPI1_BASE + offsetof(struct aipi_regs, psr0));
DEFINE(AIPI1_PSR1, IMX_AIPI1_BASE + offsetof(struct aipi_regs, psr1));
@@ -97,56 +61,6 @@ int main(void)
offsetof(struct system_control_regs, fmcr));
#endif
-#if defined(CONFIG_MX35)
- /* Round up to make sure size gives nice stack alignment */
- DEFINE(CLKCTL_CCMR, offsetof(struct ccm_regs, ccmr));
- DEFINE(CLKCTL_PDR0, offsetof(struct ccm_regs, pdr0));
- DEFINE(CLKCTL_PDR1, offsetof(struct ccm_regs, pdr1));
- DEFINE(CLKCTL_PDR2, offsetof(struct ccm_regs, pdr2));
- DEFINE(CLKCTL_PDR3, offsetof(struct ccm_regs, pdr3));
- DEFINE(CLKCTL_PDR4, offsetof(struct ccm_regs, pdr4));
- DEFINE(CLKCTL_RCSR, offsetof(struct ccm_regs, rcsr));
- DEFINE(CLKCTL_MPCTL, offsetof(struct ccm_regs, mpctl));
- DEFINE(CLKCTL_PPCTL, offsetof(struct ccm_regs, ppctl));
- DEFINE(CLKCTL_ACMR, offsetof(struct ccm_regs, acmr));
- DEFINE(CLKCTL_COSR, offsetof(struct ccm_regs, cosr));
- DEFINE(CLKCTL_CGR0, offsetof(struct ccm_regs, cgr0));
- DEFINE(CLKCTL_CGR1, offsetof(struct ccm_regs, cgr1));
- DEFINE(CLKCTL_CGR2, offsetof(struct ccm_regs, cgr2));
- DEFINE(CLKCTL_CGR3, offsetof(struct ccm_regs, cgr3));
-
- /* Multi-Layer AHB Crossbar Switch */
- DEFINE(MAX_MPR0, offsetof(struct max_regs, mpr0));
- DEFINE(MAX_SGPCR0, offsetof(struct max_regs, sgpcr0));
- DEFINE(MAX_MPR1, offsetof(struct max_regs, mpr1));
- DEFINE(MAX_SGPCR1, offsetof(struct max_regs, sgpcr1));
- DEFINE(MAX_MPR2, offsetof(struct max_regs, mpr2));
- DEFINE(MAX_SGPCR2, offsetof(struct max_regs, sgpcr2));
- DEFINE(MAX_MPR3, offsetof(struct max_regs, mpr3));
- DEFINE(MAX_SGPCR3, offsetof(struct max_regs, sgpcr3));
- DEFINE(MAX_MPR4, offsetof(struct max_regs, mpr4));
- DEFINE(MAX_SGPCR4, offsetof(struct max_regs, sgpcr4));
- DEFINE(MAX_MGPCR0, offsetof(struct max_regs, mgpcr0));
- DEFINE(MAX_MGPCR1, offsetof(struct max_regs, mgpcr1));
- DEFINE(MAX_MGPCR2, offsetof(struct max_regs, mgpcr2));
- DEFINE(MAX_MGPCR3, offsetof(struct max_regs, mgpcr3));
- DEFINE(MAX_MGPCR4, offsetof(struct max_regs, mgpcr4));
- DEFINE(MAX_MGPCR5, offsetof(struct max_regs, mgpcr5));
-
- /* AHB <-> IP-Bus Interface */
- DEFINE(AIPS_MPR_0_7, offsetof(struct aips_regs, mpr_0_7));
- DEFINE(AIPS_MPR_8_15, offsetof(struct aips_regs, mpr_8_15));
- DEFINE(AIPS_PACR_0_7, offsetof(struct aips_regs, pacr_0_7));
- DEFINE(AIPS_PACR_8_15, offsetof(struct aips_regs, pacr_8_15));
- DEFINE(AIPS_PACR_16_23, offsetof(struct aips_regs, pacr_16_23));
- DEFINE(AIPS_PACR_24_31, offsetof(struct aips_regs, pacr_24_31));
- DEFINE(AIPS_OPACR_0_7, offsetof(struct aips_regs, opacr_0_7));
- DEFINE(AIPS_OPACR_8_15, offsetof(struct aips_regs, opacr_8_15));
- DEFINE(AIPS_OPACR_16_23, offsetof(struct aips_regs, opacr_16_23));
- DEFINE(AIPS_OPACR_24_31, offsetof(struct aips_regs, opacr_24_31));
- DEFINE(AIPS_OPACR_32_39, offsetof(struct aips_regs, opacr_32_39));
-#endif
-
#if defined(CONFIG_MX51) || defined(CONFIG_MX53)
/* Round up to make sure size gives nice stack alignment */
DEFINE(CLKCTL_CCMR, offsetof(struct clkctl, ccr));
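For reference, the DEFINE() entries in this file are never executed: asm-offsets.c is only ever compiled to assembly, and each DEFINE() emits a marker line that a build script turns into a #define in the generated asm-offsets.h. A minimal sketch of the mechanism, assuming the DEFINE() macro from <linux/kbuild.h>; the example struct is hypothetical:

/* Build with "gcc -S" and grep the assembly for "->"; this file is
 * compiled but never linked or run. */
#include <stddef.h>

#define DEFINE(sym, val) \
	asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

struct example_regs {		/* hypothetical register block */
	unsigned int ctrl;
	unsigned int status;
};

int main(void)
{
	/* Emits a marker like: ->EXAMPLE_STATUS #4 offsetof(...) */
	DEFINE(EXAMPLE_STATUS, offsetof(struct example_regs, status));
	return 0;
}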
diff --git a/arch/arm/lib/asmdefs.h b/arch/arm/lib/asmdefs.h
new file mode 100644
index 00000000000..d307a3a8a25
--- /dev/null
+++ b/arch/arm/lib/asmdefs.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Macros for asm code.
+ *
+ * Copyright (c) 2019, Arm Limited.
+ */
+
+#ifndef _ASMDEFS_H
+#define _ASMDEFS_H
+
+#if defined(__aarch64__)
+
+/* Branch Target Identification support. */
+#define BTI_C hint 34
+#define BTI_J hint 36
+/* Return address signing support (pac-ret). */
+#define PACIASP hint 25; .cfi_window_save
+#define AUTIASP hint 29; .cfi_window_save
+
+/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
+#define FEATURE_1_AND 0xc0000000
+#define FEATURE_1_BTI 1
+#define FEATURE_1_PAC 2
+
+/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
+#define GNU_PROPERTY(type, value) \
+ .section .note.gnu.property, "a"; \
+ .p2align 3; \
+ .word 4; \
+ .word 16; \
+ .word 5; \
+ .asciz "GNU"; \
+ .word type; \
+ .word 4; \
+ .word value; \
+ .word 0; \
+ .text
+
+/* If set then the GNU Property Note section will be added to
+ mark objects to support BTI and PAC-RET. */
+#ifndef WANT_GNU_PROPERTY
+#define WANT_GNU_PROPERTY 1
+#endif
+
+#if WANT_GNU_PROPERTY
+/* Add property note with supported features to all asm files. */
+GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
+#endif
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc; \
+ BTI_C;
+
+#else
+
+#define END_FILE
+
+#define ENTRY_ALIGN(name, alignment) \
+ .global name; \
+ .type name,%function; \
+ .align alignment; \
+ name: \
+ .cfi_startproc;
+
+#endif
+
+#define ENTRY(name) ENTRY_ALIGN(name, 6)
+
+#define ENTRY_ALIAS(name) \
+ .global name; \
+ .type name,%function; \
+ name:
+
+#define END(name) \
+ .cfi_endproc; \
+ .size name, .-name;
+
+#define L(l) .L ## l
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of pointer arguments as per aapcs64 */
+#define PTR_ARG(n) mov w##n, w##n
+#else
+#define PTR_ARG(n)
+#endif
+
+#ifdef __ILP32__
+ /* Sanitize padding bits of size arguments as per aapcs64 */
+#define SIZE_ARG(n) mov w##n, w##n
+#else
+#define SIZE_ARG(n)
+#endif
+
+#endif
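For reference, GNU_PROPERTY() above hand-assembles an NT_GNU_PROPERTY_TYPE_0 ELF note; the .p2align 3 matches the 8-byte alignment ELF64 notes require. A C view of the resulting layout (field names are illustrative, the values mirror the macro arguments):

#include <stdint.h>

struct gnu_property_note {	/* illustrative layout only */
	uint32_t namesz;	/* 4: length of "GNU" incl. NUL */
	uint32_t descsz;	/* 16: length of the four words below */
	uint32_t type;		/* 5: NT_GNU_PROPERTY_TYPE_0 */
	char	 name[4];	/* "GNU" */
	uint32_t pr_type;	/* 0xc0000000: ..._AARCH64_FEATURE_1_AND */
	uint32_t pr_datasz;	/* 4 */
	uint32_t pr_data;	/* FEATURE_1_BTI | FEATURE_1_PAC */
	uint32_t pr_pad;	/* 0: pads the descriptor to 8 bytes */
};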
diff --git a/arch/arm/lib/bootm.c b/arch/arm/lib/bootm.c
index f60ee3a7e6a..dd6a69315ac 100644
--- a/arch/arm/lib/bootm.c
+++ b/arch/arm/lib/bootm.c
@@ -16,7 +16,6 @@
#include <command.h>
#include <cpu_func.h>
#include <dm.h>
-#include <lmb.h>
#include <log.h>
#include <asm/global_data.h>
#include <dm/root.h>
@@ -43,50 +42,6 @@ DECLARE_GLOBAL_DATA_PTR;
static struct tag *params;
-static ulong get_sp(void)
-{
- ulong ret;
-
- asm("mov %0, sp" : "=r"(ret) : );
- return ret;
-}
-
-void arch_lmb_reserve(struct lmb *lmb)
-{
- ulong sp, bank_end;
- int bank;
-
- /*
- * Booting a (Linux) kernel image
- *
- * Allocate space for command line and board info - the
- * address should be as high as possible within the reach of
- * the kernel (see CONFIG_SYS_BOOTMAPSZ settings), but in unused
- * memory, which means far enough below the current stack
- * pointer.
- */
- sp = get_sp();
- debug("## Current stack ends at 0x%08lx ", sp);
-
- /* adjust sp by 4K to be safe */
- sp -= 4096;
- for (bank = 0; bank < CONFIG_NR_DRAM_BANKS; bank++) {
- if (!gd->bd->bi_dram[bank].size ||
- sp < gd->bd->bi_dram[bank].start)
- continue;
- /* Watch out for RAM at end of address space! */
- bank_end = gd->bd->bi_dram[bank].start +
- gd->bd->bi_dram[bank].size - 1;
- if (sp > bank_end)
- continue;
- if (bank_end > gd->ram_top)
- bank_end = gd->ram_top - 1;
-
- lmb_reserve(lmb, sp, bank_end - sp + 1);
- break;
- }
-}
-
__weak void board_quiesce_devices(void)
{
}
diff --git a/arch/arm/lib/ccn504.S b/arch/arm/lib/ccn504.S
index 2c584095c3c..c6ea3e3afc9 100644
--- a/arch/arm/lib/ccn504.S
+++ b/arch/arm/lib/ccn504.S
@@ -12,7 +12,7 @@
/*************************************************************************
*
* void ccn504_add_masters_to_dvm(CCI_MN_BASE, CCI_MN_RNF_NODEID_LIST,
- * CCI_MN_DVM_DOMAIN_CTL_SET);
+ * CCI_MN_DVM_DOMAIN_CTL_SET);
*
* Add fully-coherent masters to DVM domain
*
@@ -78,4 +78,3 @@ ENTRY(ccn504_set_aux)
ret
ENDPROC(ccn504_set_aux)
-
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
index 46b6be21a8d..956d258c9da 100644
--- a/arch/arm/lib/crt0.S
+++ b/arch/arm/lib/crt0.S
@@ -130,6 +130,14 @@ ENTRY(_main)
ldr r9, [r9, #GD_NEW_GD] /* r9 <- gd->new_gd */
adr lr, here
+#if defined(CONFIG_POSITION_INDEPENDENT)
+ adr r0, _main
+ ldr r1, _start_ofs
+ add r0, r1
+ ldr r1, =CONFIG_SYS_TEXT_BASE
+ sub r1, r0
+ add lr, r1
+#endif
ldr r0, [r9, #GD_RELOC_OFF] /* r0 = gd->reloc_off */
add lr, lr, r0
#if defined(CONFIG_CPU_V7M)
@@ -180,3 +188,6 @@ here:
#endif
ENDPROC(_main)
+
+_start_ofs:
+ .word _start - _main
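The CONFIG_POSITION_INDEPENDENT block above rebases the saved return address before the usual gd->reloc_off adjustment: _start_ofs lets _main compute the run-time address of _start, and its distance from CONFIG_SYS_TEXT_BASE is how far the pre-relocation image sits from its link address. A pseudo-C sketch of the arithmetic (names are illustrative; the real values come from the adr/ldr sequence above):

#include <stdint.h>

/* here_run:  run-time address of the 'here' label
 * start_run: run-time address of _start (adr _main + _start_ofs)
 * text_base: CONFIG_SYS_TEXT_BASE, link-time address of _start
 * reloc_off: gd->reloc_off, offset to the relocated copy */
static uintptr_t fixup_return_address(uintptr_t here_run, uintptr_t start_run,
				      uintptr_t text_base, uintptr_t reloc_off)
{
	/* Rebase to link-time addresses, then relocate as usual. */
	return here_run + (text_base - start_run) + reloc_off;
}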
diff --git a/arch/arm/lib/div64.S b/arch/arm/lib/div64.S
index 3ef1ce1fff5..a83e3372149 100644
--- a/arch/arm/lib/div64.S
+++ b/arch/arm/lib/div64.S
@@ -34,12 +34,12 @@
* This is meant to be used by do_div() from include/asm/div64.h only.
*
* Input parameters:
- * xh-xl = dividend (clobbered)
- * r4 = divisor (preserved)
+ * xh-xl = dividend (clobbered)
+ * r4 = divisor (preserved)
*
* Output values:
- * yh-yl = result
- * xh = remainder
+ * yh-yl = result
+ * xh = remainder
*
* Clobbered regs: xl, ip
*/
@@ -85,7 +85,7 @@ UNWIND(.fnstart)
#endif
@ The division loop for needed upper bit positions.
- @ Break out early if dividend reaches 0.
+ @ Break out early if dividend reaches 0.
2: cmp xh, yl
orrcs yh, yh, ip
subscs xh, xh, yl
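__do_div64 is the backend of U-Boot's do_div() macro from <div64.h>, which divides a 64-bit value in place and hands back the 32-bit remainder. A minimal usage sketch:

#include <div64.h>	/* do_div(); backed by __do_div64 on 32-bit ARM */

/* Split a nanosecond count into milliseconds plus leftover ns. */
static unsigned long long ns_to_ms(unsigned long long ns, unsigned int *rem_ns)
{
	*rem_ns = do_div(ns, 1000000);	/* ns now holds the quotient */
	return ns;
}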
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index 0798d098afe..700eee5fbbe 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -34,7 +34,7 @@
mov \divisor, \divisor, lsl \result
mov \curbit, \curbit, lsl \result
mov \result, #0
-
+
#else
@ Initially shift the divisor left 3 bits if possible,
@@ -48,7 +48,7 @@
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
- @ division loop. Continue shifting until the divisor is
+ @ division loop. Continue shifting until the divisor is
@ larger than the dividend.
1: cmp \divisor, #0x10000000
cmplo \divisor, \dividend
@@ -135,7 +135,7 @@
@ Unless the divisor is very big, shift it up in multiples of
@ four bits, since this is the amount of unwinding in the main
- @ division loop. Continue shifting until the divisor is
+ @ division loop. Continue shifting until the divisor is
@ larger than the dividend.
1: cmp \divisor, #0x10000000
cmplo \divisor, \dividend
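The macros being tidied here implement the classic shift-and-subtract division their comments describe: align the divisor up toward the dividend, then walk it back down, subtracting and setting result bits wherever it still fits. A C model of the idea (illustrative only; the real ARM_DIV_BODY unrolls four bits per step, and division by zero is trapped separately):

/* Shift-and-subtract unsigned division; divisor must be nonzero. */
static unsigned int div_shift_subtract(unsigned int dividend,
				       unsigned int divisor)
{
	unsigned int result = 0, curbit = 1;

	/* Shift the divisor up until it is larger than the dividend. */
	while (divisor < dividend && !(divisor & 0x80000000u)) {
		divisor <<= 1;
		curbit <<= 1;
	}
	/* Walk back down, subtracting where possible. */
	do {
		if (dividend >= divisor) {
			dividend -= divisor;
			result |= curbit;
		}
		divisor >>= 1;
		curbit >>= 1;
	} while (curbit);

	return result;		/* 'dividend' now holds the remainder */
}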
diff --git a/arch/arm/lib/memcpy-arm64.S b/arch/arm/lib/memcpy-arm64.S
new file mode 100644
index 00000000000..507054d847e
--- /dev/null
+++ b/arch/arm/lib/memcpy-arm64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memcpy - copy memory area
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#include "asmdefs.h"
+
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define G_l count
+#define G_h dst
+#define H_l src
+#define H_h srcend
+#define tmp1 x14
+
+/* This implementation handles overlaps and supports both memcpy and memmove
+ from a single entry point. It uses unaligned accesses and branchless
+ sequences to keep the code small, simple and improve performance.
+
+ Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+ copies of up to 128 bytes, and large copies. The overhead of the overlap
+ check is negligible since it is only required for large copies.
+
+ Large copies use a software pipelined loop processing 64 bytes per iteration.
+ The destination pointer is 16-byte aligned to minimize unaligned accesses.
+ The loop tail is handled by always copying 64 bytes from the end.
+*/
+
+ENTRY_ALIAS (memmove)
+ENTRY (memcpy)
+ PTR_ARG (0)
+ PTR_ARG (1)
+ SIZE_ARG (2)
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+ ldp A_l, A_h, [src]
+ ldp D_l, D_h, [srcend, -16]
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+ ldr A_l, [src]
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+ ldr A_lw, [src]
+ ldr B_lw, [srcend, -4]
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ ret
+
+ /* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+ cbz count, L(copy0)
+ lsr tmp1, count, 1
+ ldrb A_lw, [src]
+ ldrb C_lw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb C_lw, [dstend, -1]
+L(copy0):
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ ldp D_l, D_h, [srcend, -16]
+ cmp count, 64
+ b.hi L(copy128)
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy 65..128 bytes. */
+L(copy128):
+ ldp E_l, E_h, [src, 32]
+ ldp F_l, F_h, [src, 48]
+ cmp count, 96
+ b.ls L(copy96)
+ ldp G_l, G_h, [srcend, -64]
+ ldp H_l, H_h, [srcend, -48]
+ stp G_l, G_h, [dstend, -64]
+ stp H_l, H_h, [dstend, -48]
+L(copy96):
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Use backwards copy if there is an overlap. */
+ sub tmp1, dstin, src
+ cbz tmp1, L(copy0)
+ cmp tmp1, count
+ b.lo L(copy_long_backwards)
+
+ /* Copy 16 bytes and then align dst to 16-byte alignment. */
+
+ ldp D_l, D_h, [src]
+ and tmp1, dstin, 15
+ bic dst, dstin, 15
+ sub src, src, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp A_l, A_h, [src, 16]
+ stp D_l, D_h, [dstin]
+ ldp B_l, B_h, [src, 32]
+ ldp C_l, C_h, [src, 48]
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 128 + 16 /* Test and readjust count. */
+ b.ls L(copy64_from_end)
+
+L(loop64):
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [src, 16]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [src, 32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [src, 48]
+ stp D_l, D_h, [dst, 64]!
+ ldp D_l, D_h, [src, 64]!
+ subs count, count, 64
+ b.hi L(loop64)
+
+ /* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+ ldp E_l, E_h, [srcend, -64]
+ stp A_l, A_h, [dst, 16]
+ ldp A_l, A_h, [srcend, -48]
+ stp B_l, B_h, [dst, 32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dst, 48]
+ ldp C_l, C_h, [srcend, -16]
+ stp D_l, D_h, [dst, 64]
+ stp E_l, E_h, [dstend, -64]
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ ret
+
+ .p2align 4
+
+ /* Large backwards copy for overlapping copies.
+ Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+ ldp D_l, D_h, [srcend, -16]
+ and tmp1, dstend, 15
+ sub srcend, srcend, tmp1
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]
+ stp D_l, D_h, [dstend, -16]
+ ldp B_l, B_h, [srcend, -32]
+ ldp C_l, C_h, [srcend, -48]
+ ldp D_l, D_h, [srcend, -64]!
+ sub dstend, dstend, tmp1
+ subs count, count, 128
+ b.ls L(copy64_from_start)
+
+L(loop64_backwards):
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [srcend, -16]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [srcend, -32]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [srcend, -48]
+ stp D_l, D_h, [dstend, -64]!
+ ldp D_l, D_h, [srcend, -64]!
+ subs count, count, 64
+ b.hi L(loop64_backwards)
+
+ /* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+ ldp G_l, G_h, [src, 48]
+ stp A_l, A_h, [dstend, -16]
+ ldp A_l, A_h, [src, 32]
+ stp B_l, B_h, [dstend, -32]
+ ldp B_l, B_h, [src, 16]
+ stp C_l, C_h, [dstend, -48]
+ ldp C_l, C_h, [src]
+ stp D_l, D_h, [dstend, -64]
+ stp G_l, G_h, [dstin, 48]
+ stp A_l, A_h, [dstin, 32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin]
+ ret
+
+END (memcpy)
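One detail worth spelling out is the branchless 0..3 byte tail in L(copy4) above: it loads one byte from the start, one from the middle (count/2) and one from the end, then stores all three, so lengths 1, 2 and 3 are covered without branching on the exact count (some bytes are simply written twice). Because every load completes before any store, the trick also preserves the memmove overlap guarantee. A C model:

/* Branchless copy of 0..3 bytes, mirroring L(copy4). */
static void copy_upto3(unsigned char *dst, const unsigned char *src,
		       unsigned long count)
{
	unsigned char a, b, c;
	unsigned long mid;

	if (count == 0)
		return;
	mid = count >> 1;
	a = src[0];		/* load everything first ... */
	b = src[mid];
	c = src[count - 1];
	dst[0] = a;		/* ... then store, so overlap is safe */
	dst[mid] = b;
	dst[count - 1] = c;
}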
diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S
new file mode 100644
index 00000000000..ee9f9a96cfe
--- /dev/null
+++ b/arch/arm/lib/memset-arm64.S
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2021, Arm Limited.
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <asm/macro.h>
+#include "asmdefs.h"
+
+#define dstin x0
+#define val x1
+#define valw w1
+#define count x2
+#define dst x3
+#define dstend x4
+#define zva_val x5
+
+ENTRY (memset)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+ /*
+ * The optimized memset uses the dc opcode, which causes problems
+ * when the cache is disabled. Let's check if the cache is disabled
+ * and use a very simple memset implementation in this case. Otherwise
+ * jump to the optimized version.
+ */
+ switch_el x6, 3f, 2f, 1f
+3: mrs x6, sctlr_el3
+ b 0f
+2: mrs x6, sctlr_el2
+ b 0f
+1: mrs x6, sctlr_el1
+0:
+ tst x6, #CR_C
+ bne 9f
+
+ /*
+ * A very "simple" memset implementation without the use of the
+ * dc opcode. Can be run with caches disabled.
+ */
+ mov x3, #0x0
+ cmp count, x3 /* check for zero length */
+ beq 8f
+4: strb valw, [dstin, x3]
+ add x3, x3, #0x1
+ cmp count, x3
+ bne 4b
+8: ret
+9:
+
+ /* Here the optimized memset version starts */
+ dup v0.16B, valw
+ add dstend, dstin, count
+
+ cmp count, 96
+ b.hi L(set_long)
+ cmp count, 16
+ b.hs L(set_medium)
+ mov val, v0.D[0]
+
+ /* Set 0..15 bytes. */
+ tbz count, 3, 1f
+ str val, [dstin]
+ str val, [dstend, -8]
+ ret
+ .p2align 4
+1: tbz count, 2, 2f
+ str valw, [dstin]
+ str valw, [dstend, -4]
+ ret
+2: cbz count, 3f
+ strb valw, [dstin]
+ tbz count, 1, 3f
+ strh valw, [dstend, -2]
+3: ret
+
+ /* Set 17..96 bytes. */
+L(set_medium):
+ str q0, [dstin]
+ tbnz count, 6, L(set96)
+ str q0, [dstend, -16]
+ tbz count, 5, 1f
+ str q0, [dstin, 16]
+ str q0, [dstend, -32]
+1: ret
+
+ .p2align 4
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+L(set96):
+ str q0, [dstin, 16]
+ stp q0, q0, [dstin, 32]
+ stp q0, q0, [dstend, -32]
+ ret
+
+ .p2align 4
+L(set_long):
+ and valw, valw, 255
+ bic dst, dstin, 15
+ str q0, [dstin]
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+ bic dst, dst, 63
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
+ add dst, dst, 64
+ dc zva, dst
+ subs count, count, 64
+ b.hi L(zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
+ stp q0, q0, [dst, 32]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
+ stp q0, q0, [dstend, -64]
+ stp q0, q0, [dstend, -32]
+ ret
+
+END (memset)
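The SKIP_ZVA_CHECK test above reads DCZID_EL0: bits [3:0] encode log2 of the DC ZVA block size in 4-byte words, and bit 4 (DZP) prohibits DC ZVA when set, so requiring the low five bits to equal 4 means "64-byte blocks, ZVA permitted". A sketch of the decode:

#include <stdint.h>

/* Returns the DC ZVA block size in bytes, or 0 if prohibited. */
static unsigned int zva_block_bytes(uint64_t dczid_el0)
{
	if (dczid_el0 & 0x10)		/* DZP: DC ZVA not allowed */
		return 0;
	return 4u << (dczid_el0 & 0xf);	/* words -> bytes; 4 -> 64 */
}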
diff --git a/arch/arm/lib/relocate.S b/arch/arm/lib/relocate.S
index e5f7267be19..14b7f61c1a4 100644
--- a/arch/arm/lib/relocate.S
+++ b/arch/arm/lib/relocate.S
@@ -78,22 +78,28 @@ ENDPROC(relocate_vectors)
*/
ENTRY(relocate_code)
- ldr r1, =__image_copy_start /* r1 <- SRC &__image_copy_start */
- subs r4, r0, r1 /* r4 <- relocation offset */
- beq relocate_done /* skip relocation */
- ldr r2, =__image_copy_end /* r2 <- SRC &__image_copy_end */
-
+ adr r3, relocate_code
+ ldr r1, _image_copy_start_ofs
+ add r1, r3 /* r1 <- Run &__image_copy_start */
+ subs r4, r0, r1 /* r4 <- Run to copy offset */
+ beq relocate_done /* skip relocation */
+ ldr r1, _image_copy_start_ofs
+ add r1, r3 /* r1 <- Run &__image_copy_start */
+ ldr r2, _image_copy_end_ofs
+ add r2, r3 /* r2 <- Run &__image_copy_end */
copy_loop:
- ldmia r1!, {r10-r11} /* copy from source address [r1] */
- stmia r0!, {r10-r11} /* copy to target address [r0] */
- cmp r1, r2 /* until source end address [r2] */
+ ldmia r1!, {r10-r11} /* copy from source address [r1] */
+ stmia r0!, {r10-r11} /* copy to target address [r0] */
+ cmp r1, r2 /* until source end address [r2] */
blo copy_loop
/*
* fix .rel.dyn relocations
*/
- ldr r2, =__rel_dyn_start /* r2 <- SRC &__rel_dyn_start */
- ldr r3, =__rel_dyn_end /* r3 <- SRC &__rel_dyn_end */
+ ldr r1, _rel_dyn_start_ofs
+ add r2, r1, r3 /* r2 <- Run &__rel_dyn_start */
+ ldr r1, _rel_dyn_end_ofs
+ add r3, r1, r3 /* r3 <- Run &__rel_dyn_end */
fixloop:
ldmia r2!, {r0-r1} /* (r0,r1) <- (SRC location,fixup) */
and r1, r1, #0xff
@@ -129,3 +135,12 @@ relocate_done:
#endif
ENDPROC(relocate_code)
+
+_image_copy_start_ofs:
+ .word __image_copy_start - relocate_code
+_image_copy_end_ofs:
+ .word __image_copy_end - relocate_code
+_rel_dyn_start_ofs:
+ .word __rel_dyn_start - relocate_code
+_rel_dyn_end_ofs:
+ .word __rel_dyn_end - relocate_code
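The reworked fixloop still walks ELF32 REL entries, now found through PC-relative offset words so it works at the pre-relocation address. Each R_ARM_RELATIVE entry (type 23, extracted by the "and r1, r1, #0xff" above) names a word holding a link-time pointer; the fixup adds the relocation offset to both the word's location and its contents. A C model (illustrative; the real loop runs from assembly before any C environment exists):

#include <stdint.h>

#define R_ARM_RELATIVE	23

struct elf32_rel {
	uint32_t r_offset;	/* address of the word to patch */
	uint32_t r_info;	/* low byte: relocation type */
};

static void fix_rel_dyn(struct elf32_rel *rel, struct elf32_rel *end,
			uint32_t reloc_off)
{
	for (; rel < end; rel++) {
		uint32_t *loc;

		if ((rel->r_info & 0xff) != R_ARM_RELATIVE)
			continue;
		loc = (uint32_t *)(rel->r_offset + reloc_off);
		*loc += reloc_off;	/* rebase the stored pointer */
	}
}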
diff --git a/arch/arm/lib/stack.c b/arch/arm/lib/stack.c
index b03e1cfc80c..656084c7e51 100644
--- a/arch/arm/lib/stack.c
+++ b/arch/arm/lib/stack.c
@@ -12,6 +12,7 @@
*/
#include <common.h>
#include <init.h>
+#include <lmb.h>
#include <asm/global_data.h>
DECLARE_GLOBAL_DATA_PTR;
@@ -33,3 +34,16 @@ int arch_reserve_stacks(void)
return 0;
}
+
+static ulong get_sp(void)
+{
+ ulong ret;
+
+ asm("mov %0, sp" : "=r"(ret) : );
+ return ret;
+}
+
+void arch_lmb_reserve(struct lmb *lmb)
+{
+ arch_lmb_reserve_generic(lmb, get_sp(), gd->ram_top, 16384);
+}
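arch_lmb_reserve_generic() centralizes the policy that the hand-rolled arch_lmb_reserve() removed from bootm.c implemented: find the DRAM bank holding the current stack pointer and reserve everything from a little below sp to the end of that bank (capped at gd->ram_top), so bootm cannot place the kernel or FDT on top of the live stack. A sketch of that policy, assuming it mirrors the removed loop with the 16 KiB headroom passed above:

/* Illustrative only; mirrors the loop removed from bootm.c. */
static void reserve_below_sp(struct lmb *lmb, ulong sp, ulong end, ulong align)
{
	ulong bank_end;
	int bank;

	sp -= align;	/* leave headroom below the live stack */
	for (bank = 0; bank < CONFIG_NR_DRAM_BANKS; bank++) {
		if (!gd->bd->bi_dram[bank].size ||
		    sp < gd->bd->bi_dram[bank].start)
			continue;
		/* Watch out for RAM at the end of the address space! */
		bank_end = gd->bd->bi_dram[bank].start +
			   gd->bd->bi_dram[bank].size - 1;
		if (sp > bank_end)
			continue;
		if (bank_end > end)
			bank_end = end - 1;
		lmb_reserve(lmb, sp, bank_end - sp + 1);
		break;
	}
}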