
Commit 32b63776 authored by Russell King

Merge tag 'arm-plt-optimizations-for-v4.9' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into devel-stable

This series of 4 patches optimizes the ARM PLT generation code that
is invoked at module load time, to get rid of the O(n^2) algorithm
that results in pathological load times of 10 seconds or more for
large modules on certain STB platforms.
parents c6935931 66e94ba3
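
Editorial note: the essence of the series is to replace the quadratic
duplicate scan with a sort followed by a single predecessor comparison,
as cmp_rel() and duplicate_rel() do in the patch below. A minimal
standalone sketch of that idea (struct reloc, cmp_reloc() and
count_unique() are hypothetical stand-ins, not code from this commit):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for Elf32_Rel: relocation type and symbol index. */
struct reloc {
	unsigned int type;
	unsigned int sym;
};

/* Order by type, then by symbol index (mirrors cmp_rel() in the patch). */
static int cmp_reloc(const void *a, const void *b)
{
	const struct reloc *x = a, *y = b;

	if (x->type != y->type)
		return x->type < y->type ? -1 : 1;
	if (x->sym != y->sym)
		return x->sym < y->sym ? -1 : 1;
	return 0;
}

/*
 * Count distinct (type, sym) pairs. After the sort, a duplicate can only
 * sit in the immediately preceding slot, so one linear pass suffices.
 */
static unsigned int count_unique(struct reloc *r, size_t n)
{
	unsigned int ret = 0;
	size_t i;

	qsort(r, n, sizeof(*r), cmp_reloc);
	for (i = 0; i < n; i++)
		if (i == 0 || cmp_reloc(&r[i - 1], &r[i]) != 0)
			ret++;
	return ret;
}

int main(void)
{
	struct reloc r[] = {
		{ 28, 5 }, { 28, 7 }, { 28, 5 }, { 30, 5 }, { 28, 7 },
	};

	/* Prints 3: {28,5}, {28,7} and {30,5}. */
	printf("%u\n", count_unique(r, sizeof(r) / sizeof(r[0])));
	return 0;
}

This turns the O(n^2) all-pairs comparison into an O(n log n) sort plus
a single O(n) pass, which is why load times stay flat even for very
large modules.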
arch/arm/include/asm/module.h +2 −4

@@ -23,10 +23,8 @@ struct mod_arch_specific {
 	struct unwind_table *unwind[ARM_SEC_MAX];
 #endif
 #ifdef CONFIG_ARM_MODULE_PLTS
-	struct elf32_shdr   *core_plt;
-	struct elf32_shdr   *init_plt;
-	int		    core_plt_count;
-	int		    init_plt_count;
+	struct elf32_shdr   *plt;
+	int		    plt_count;
 #endif
 };
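
The net effect of this hunk, quoted for reference from the patched
header (an editorial excerpt, not an additional change): the four
per-section fields collapse into a single section pointer and counter,
since core and init code now share one PLT.

#ifdef CONFIG_ARM_MODULE_PLTS
	struct elf32_shdr   *plt;
	int		    plt_count;
#endif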

arch/arm/kernel/module-plts.c +144 −99

@@ -9,6 +9,7 @@
 #include <linux/elf.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 
 #include <asm/cache.h>
 #include <asm/opcodes.h>
@@ -30,101 +31,142 @@ struct plt_entries {
 	u32	lit[PLT_ENT_COUNT];
 };
 
-static bool in_init(const struct module *mod, u32 addr)
-{
-	return addr - (u32)mod->init_layout.base < mod->init_layout.size;
-}
-
 u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val)
 {
-	struct plt_entries *plt, *plt_end;
-	int c, *count;
-
-	if (in_init(mod, loc)) {
-		plt = (void *)mod->arch.init_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.init_plt->sh_size;
-		count = &mod->arch.init_plt_count;
-	} else {
-		plt = (void *)mod->arch.core_plt->sh_addr;
-		plt_end = (void *)plt + mod->arch.core_plt->sh_size;
-		count = &mod->arch.core_plt_count;
-	}
+	struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr;
+	int idx = 0;
 
-	/* Look for an existing entry pointing to 'val' */
-	for (c = *count; plt < plt_end; c -= PLT_ENT_COUNT, plt++) {
-		int i;
-
-		if (!c) {
-			/* Populate a new set of entries */
-			*plt = (struct plt_entries){
-				{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
-				{ val, }
-			};
-			++*count;
-			return (u32)plt->ldr;
-		}
-		for (i = 0; i < PLT_ENT_COUNT; i++) {
-			if (!plt->lit[i]) {
-				plt->lit[i] = val;
-				++*count;
-			}
-
-			if (plt->lit[i] == val)
-				return (u32)&plt->ldr[i];
-		}
-	}
-	BUG();
+	/*
+	 * Look for an existing entry pointing to 'val'. Given that the
+	 * relocations are sorted, this will be the last entry we allocated.
+	 * (if one exists).
+	 */
+	if (mod->arch.plt_count > 0) {
+		plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT;
+		idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT;
+
+		if (plt->lit[idx] == val)
+			return (u32)&plt->ldr[idx];
+
+		idx = (idx + 1) % PLT_ENT_COUNT;
+		if (!idx)
+			plt++;
+	}
+
+	mod->arch.plt_count++;
+	BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size);
+
+	if (!idx)
+		/* Populate a new set of entries */
+		*plt = (struct plt_entries){
+			{ [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, },
+			{ val, }
+		};
+	else
+		plt->lit[idx] = val;
+
+	return (u32)&plt->ldr[idx];
 }
 
-static int duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num,
-			 u32 mask)
+#define cmp_3way(a,b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rel(const void *a, const void *b)
 {
-	u32 *loc1, *loc2;
+	const Elf32_Rel *x = a, *y = b;
 	int i;
 
-	for (i = 0; i < num; i++) {
-		if (rel[i].r_info != rel[num].r_info)
-			continue;
-
-		/*
-		 * Identical relocation types against identical symbols can
-		 * still result in different PLT entries if the addend in the
-		 * place is different. So resolve the target of the relocation
-		 * to compare the values.
-		 */
-		loc1 = (u32 *)(base + rel[i].r_offset);
-		loc2 = (u32 *)(base + rel[num].r_offset);
-		if (((*loc1 ^ *loc2) & mask) == 0)
-			return 1;
-	}
-	return 0;
+	/* sort by type and symbol index */
+	i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info));
+	if (i == 0)
+		i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info));
+	return i;
+}
+
+static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel)
+{
+	u32 *tval = (u32 *)(base + rel->r_offset);
+
+	/*
+	 * Do a bitwise compare on the raw addend rather than fully decoding
+	 * the offset and doing an arithmetic comparison.
+	 * Note that a zero-addend jump/call relocation is encoded taking the
+	 * PC bias into account, i.e., -8 for ARM and -4 for Thumb2.
+	 */
+	switch (ELF32_R_TYPE(rel->r_info)) {
+		u16 upper, lower;
+
+	case R_ARM_THM_CALL:
+	case R_ARM_THM_JUMP24:
+		upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]);
+		lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]);
+
+		return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe;
+
+	case R_ARM_CALL:
+	case R_ARM_PC24:
+	case R_ARM_JUMP24:
+		return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe;
+	}
+	BUG();
+}
+
+static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num)
+{
+	const Elf32_Rel *prev;
+
+	/*
+	 * Entries are sorted by type and symbol index. That means that,
+	 * if a duplicate entry exists, it must be in the preceding
+	 * slot.
+	 */
+	if (!num)
+		return false;
+
+	prev = rel + num - 1;
+	return cmp_rel(rel + num, prev) == 0 &&
+	       is_zero_addend_relocation(base, prev);
 }
 
 /* Count how many PLT entries we may need */
-static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
+static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base,
+			       const Elf32_Rel *rel, int num)
 {
 	unsigned int ret = 0;
+	const Elf32_Sym *s;
 	int i;
 
-	/*
-	 * Sure, this is order(n^2), but it's usually short, and not
-	 * time critical
-	 */
-	for (i = 0; i < num; i++)
+	for (i = 0; i < num; i++) {
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 		case R_ARM_CALL:
 		case R_ARM_PC24:
 		case R_ARM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_arm(0x00ffffff)))
-				ret++;
-			break;
 #ifdef CONFIG_THUMB2_KERNEL
 		case R_ARM_THM_CALL:
 		case R_ARM_THM_JUMP24:
-			if (!duplicate_rel(base, rel, i,
-					   __opcode_to_mem_thumb32(0x07ff2fff)))
-				ret++;
-			break;
 #endif
+			/*
+			 * We only have to consider branch targets that resolve
+			 * to undefined symbols. This is not simply a heuristic,
+			 * it is a fundamental limitation, since the PLT itself
+			 * is part of the module, and needs to be within range
+			 * as well, so modules can never grow beyond that limit.
+			 */
+			s = syms + ELF32_R_SYM(rel[i].r_info);
+			if (s->st_shndx != SHN_UNDEF)
+				break;
+
+			/*
+			 * Jump relocations with non-zero addends against
+			 * undefined symbols are supported by the ELF spec, but
+			 * do not occur in practice (e.g., 'jump n bytes past
+			 * the entry point of undefined function symbol f').
+			 * So we need to support them, but there is no need to
+			 * take them into consideration when trying to optimize
+			 * this code. So let's only check for duplicates when
+			 * the addend is zero.
+			 */
+			if (!is_zero_addend_relocation(base, rel + i) ||
+			    !duplicate_rel(base, rel, i))
+				ret++;
 		}
+	}
 	return ret;
 }
@@ -132,52 +174,55 @@ static unsigned int count_plts(Elf32_Addr base, const Elf32_Rel *rel, int num)
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
-	unsigned long core_plts = 0, init_plts = 0;
+	unsigned long plts = 0;
 	Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum;
+	Elf32_Sym *syms = NULL;
 
 	/*
 	 * To store the PLTs, we expand the .text section for core module code
-	 * and the .init.text section for initialization code.
+	 * and for initialization code.
 	 */
-	for (s = sechdrs; s < sechdrs_end; ++s)
-		if (strcmp(".core.plt", secstrings + s->sh_name) == 0)
-			mod->arch.core_plt = s;
-		else if (strcmp(".init.plt", secstrings + s->sh_name) == 0)
-			mod->arch.init_plt = s;
-
-	if (!mod->arch.core_plt || !mod->arch.init_plt) {
-		pr_err("%s: sections missing\n", mod->name);
+	for (s = sechdrs; s < sechdrs_end; ++s) {
+		if (strcmp(".plt", secstrings + s->sh_name) == 0)
+			mod->arch.plt = s;
+		else if (s->sh_type == SHT_SYMTAB)
+			syms = (Elf32_Sym *)s->sh_addr;
+	}
+
+	if (!mod->arch.plt) {
+		pr_err("%s: module PLT section missing\n", mod->name);
+		return -ENOEXEC;
+	}
+	if (!syms) {
+		pr_err("%s: module symtab section missing\n", mod->name);
 		return -ENOEXEC;
 	}
 
 	for (s = sechdrs + 1; s < sechdrs_end; ++s) {
-		const Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
+		Elf32_Rel *rels = (void *)ehdr + s->sh_offset;
 		int numrels = s->sh_size / sizeof(Elf32_Rel);
 		Elf32_Shdr *dstsec = sechdrs + s->sh_info;
 
 		if (s->sh_type != SHT_REL)
 			continue;
 
-		if (strstr(secstrings + s->sh_name, ".init"))
-			init_plts += count_plts(dstsec->sh_addr, rels, numrels);
-		else
-			core_plts += count_plts(dstsec->sh_addr, rels, numrels);
+		/* ignore relocations that operate on non-exec sections */
+		if (!(dstsec->sh_flags & SHF_EXECINSTR))
+			continue;
+
+		/* sort by type and symbol index */
+		sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL);
+
+		plts += count_plts(syms, dstsec->sh_addr, rels, numrels);
 	}
 
-	mod->arch.core_plt->sh_type = SHT_NOBITS;
-	mod->arch.core_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.core_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.core_plt->sh_size = round_up(core_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.core_plt_count = 0;
-
-	mod->arch.init_plt->sh_type = SHT_NOBITS;
-	mod->arch.init_plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
-	mod->arch.init_plt->sh_addralign = L1_CACHE_BYTES;
-	mod->arch.init_plt->sh_size = round_up(init_plts * PLT_ENT_SIZE,
-					       sizeof(struct plt_entries));
-	mod->arch.init_plt_count = 0;
-	pr_debug("%s: core.plt=%x, init.plt=%x\n", __func__,
-		 mod->arch.core_plt->sh_size, mod->arch.init_plt->sh_size);
+	mod->arch.plt->sh_type = SHT_NOBITS;
+	mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.plt->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE,
+					  sizeof(struct plt_entries));
+	mod->arch.plt_count = 0;
+
+	pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size);
 	return 0;
 }
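
Editorial note on the new get_module_plt() above: because the
relocations are sorted before the PLT is populated, a repeated branch
target is always the most recently allocated entry, so the function can
locate it with plain div/mod arithmetic on plt_count rather than a scan.
A standalone sketch of that arithmetic (the 64-byte stride is an
assumption for illustration; the kernel derives it from L1_CACHE_BYTES):

#include <stdio.h>

#define PLT_ENT_STRIDE	64	/* assumed cache-line size */
#define PLT_ENT_COUNT	(PLT_ENT_STRIDE / (int)sizeof(unsigned int))

int main(void)
{
	int plt_count;

	/*
	 * The PLT is an array of struct plt_entries blocks holding
	 * PLT_ENT_COUNT slots each, so a running counter maps directly
	 * onto the (block, index) position of the last allocation.
	 */
	for (plt_count = 1; plt_count <= PLT_ENT_COUNT + 2; plt_count++)
		printf("entry %2d -> block %d, idx %2d\n", plt_count,
		       (plt_count - 1) / PLT_ENT_COUNT,
		       (plt_count - 1) % PLT_ENT_COUNT);
	return 0;
}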
arch/arm/kernel/module.lds +1 −2

@@ -1,4 +1,3 @@
 SECTIONS {
-        .core.plt : { BYTE(0) }
-        .init.plt : { BYTE(0) }
+	.plt : { BYTE(0) }
 }