Merge branch 'sparc64-M7-memcpy' (fa5dc772) · Commits · e / devices / android_kernel_fairphone_FP5

arch/sparc/kernel/head_64.S

+14 −2

Original line number	Diff line number	Diff line
		@@ -603,10 +603,10 @@ niagara_tlb_fixup:
		be,pt %xcc, niagara4_patch
		nop
		cmp %g1, SUN4V_CHIP_SPARC_M7
		be,pt %xcc, niagara4_patch
		be,pt %xcc, sparc_m7_patch
		nop
		cmp %g1, SUN4V_CHIP_SPARC_M8
		be,pt %xcc, niagara4_patch
		be,pt %xcc, sparc_m7_patch
		nop
		cmp %g1, SUN4V_CHIP_SPARC_SN
		be,pt %xcc, niagara4_patch
		@@ -621,6 +621,18 @@ niagara_tlb_fixup:

		ba,a,pt %xcc, 80f
		nop

		sparc_m7_patch:
		call m7_patch_copyops
		nop
		call m7_patch_bzero
		nop
		call m7_patch_pageops
		nop

		ba,a,pt %xcc, 80f
		nop

		niagara4_patch:
		call niagara4_patch_copyops
		nop

arch/sparc/lib/M7copy_from_user.S

0 → 100644

+40 −0

Original line number	Diff line number	Diff line
		/*
		* M7copy_from_user.S: SPARC M7 optimized copy from userspace.
		*
		* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
		*/


		#define EX_LD(x, y) \
		98: x; \
		.section __ex_table,"a"; \
		.align 4; \
		.word 98b, y; \
		.text; \
		.align 4;

		#define EX_LD_FP(x, y) \
		98: x; \
		.section __ex_table,"a"; \
		.align 4; \
		.word 98b, y##_fp; \
		.text; \
		.align 4;

		#ifndef ASI_AIUS
		#define ASI_AIUS 0x11
		#endif

		#define FUNC_NAME M7copy_from_user
		#define LOAD(type,addr,dest) type##a [addr] %asi, dest
		#define EX_RETVAL(x) 0

		#ifdef __KERNEL__
		#define PREAMBLE \
		rd %asi, %g1; \
		cmp %g1, ASI_AIUS; \
		bne,pn %icc, raw_copy_in_user; \
		nop
		#endif

		#include "M7memcpy.S"

arch/sparc/lib/M7copy_to_user.S

0 → 100644

+51 −0

Original line number	Diff line number	Diff line
		/*
		* M7copy_to_user.S: SPARC M7 optimized copy to userspace.
		*
		* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
		*/


		#define EX_ST(x, y) \
		98: x; \
		.section __ex_table,"a"; \
		.align 4; \
		.word 98b, y; \
		.text; \
		.align 4;

		#define EX_ST_FP(x, y) \
		98: x; \
		.section __ex_table,"a"; \
		.align 4; \
		.word 98b, y##_fp; \
		.text; \
		.align 4;


		#ifndef ASI_AIUS
		#define ASI_AIUS 0x11
		#endif

		#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
		#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
		#endif

		#define FUNC_NAME M7copy_to_user
		#define STORE(type,src,addr) type##a src, [addr] %asi
		#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
		#define STORE_MRU_ASI ASI_ST_BLKINIT_MRU_S
		#define EX_RETVAL(x) 0

		#ifdef __KERNEL__
		/* Writing to %asi is _expensive_ so we hardcode it.
		* Reading %asi to check for KERNEL_DS is comparatively
		* cheap.
		*/
		#define PREAMBLE \
		rd %asi, %g1; \
		cmp %g1, ASI_AIUS; \
		bne,pn %icc, raw_copy_in_user; \
		nop
		#endif

		#include "M7memcpy.S"

arch/sparc/lib/M7memcpy.S

0 → 100644

+923 −0

File added.

Preview size limit exceeded, changes collapsed.

arch/sparc/lib/M7memset.S

0 → 100644

+352 −0

Original line number	Diff line number	Diff line
		/*
		* M7memset.S: SPARC M7 optimized memset.
		*
		* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
		*/

		/*
		* M7memset.S: M7 optimized memset.
		*
		* char *memset(sp, c, n)
		*
		* Set an array of n chars starting at sp to the character c.
		* Return sp.
		*
		* Fast assembler language version of the following C-program for memset
		* which represents the `standard' for the C-library.
		*
		* void *
		* memset(void *sp1, int c, size_t n)
		* {
		* if (n != 0) {
		* char *sp = sp1;
		* do {
		* *sp++ = (char)c;
		* } while (--n != 0);
		* }
		* return (sp1);
		* }
		*
		* The algorithm is as follows :
		*
		* For small 6 or fewer bytes stores, bytes will be stored.
		*
		* For less than 32 bytes stores, align the address on 4 byte boundary.
		* Then store as many 4-byte chunks, followed by trailing bytes.
		*
		* For sizes greater than 32 bytes, align the address on 8 byte boundary.
		* if (count >= 64) {
		* store 8-bytes chunks to align the address on 64 byte boundary
		* if (value to be set is zero && count >= MIN_ZERO) {
		* Using BIS stores, set the first long word of each
		* 64-byte cache line to zero which will also clear the
		* other seven long words of the cache line.
		* }
		* else if (count >= MIN_LOOP) {
		* Using BIS stores, set the first long word of each of
		* ST_CHUNK cache lines (64 bytes each) before the main
		* loop is entered.
		* In the main loop, continue pre-setting the first long
		* word of each cache line ST_CHUNK lines in advance while
		* setting the other seven long words (56 bytes) of each
		* cache line until fewer than ST_CHUNK*64 bytes remain.
		* Then set the remaining seven long words of each cache
		* line that has already had its first long word set.
		* }
		* store remaining data in 64-byte chunks until less than
		* 64 bytes remain.
		* }
		* Store as many 8-byte chunks, followed by trailing bytes.
		*
		* BIS = Block Init Store
		* Doing the advance store of the first element of the cache line
		* initiates the displacement of a cache line while only using a single
		* instruction in the pipeline. That avoids various pipeline delays,
		* such as filling the miss buffer. The performance effect is
		* similar to prefetching for normal stores.
		* The special case for zero fills runs faster and uses fewer instruction
		* cycles than the normal memset loop.
		*
		* We only use BIS for memset of greater than MIN_LOOP bytes because a sequence
		* BIS stores must be followed by a membar #StoreStore. The benefit of
		* the BIS store must be balanced against the cost of the membar operation.
		*/

		/*
		* ASI_STBI_P marks the cache line as "least recently used"
		* which means if many threads are active, it has a high chance
		* of being pushed out of the cache between the first initializing
		* store and the final stores.
		* Thus, we use ASI_STBIMRU_P which marks the cache line as
		* "most recently used" for all but the last store to the cache line.
		*/

		#include <asm/asi.h>
		#include <asm/page.h>

		#define ASI_STBI_P ASI_BLK_INIT_QUAD_LDD_P
		#define ASI_STBIMRU_P ASI_ST_BLKINIT_MRU_P


		#define ST_CHUNK 24 /* multiple of 4 due to loop unrolling */
		#define MIN_LOOP 16320
		#define MIN_ZERO 512

		.section ".text"
		.align 32

		/*
		* Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
		* (can create a more optimized version later.)
		*/
		.globl M7clear_page
		.globl M7clear_user_page
		M7clear_page: /* clear_page(dest) */
		M7clear_user_page:
		set PAGE_SIZE, %o1
		/* fall through into bzero code */

		.size M7clear_page,.-M7clear_page
		.size M7clear_user_page,.-M7clear_user_page

		/*
		* Define bzero(dest, n) as memset(dest, 0, n)
		* (can create a more optimized version later.)
		*/
		.globl M7bzero
		M7bzero: /* bzero(dest, size) */
		mov %o1, %o2
		mov 0, %o1
		/* fall through into memset code */

		.size M7bzero,.-M7bzero

		.global M7memset
		.type M7memset, #function
		.register %g3, #scratch
		M7memset:
		mov %o0, %o5 ! copy sp1 before using it
		cmp %o2, 7 ! if small counts, just write bytes
		bleu,pn %xcc, .wrchar
		and %o1, 0xff, %o1 ! o1 is (char)c

		sll %o1, 8, %o3
		or %o1, %o3, %o1 ! now o1 has 2 bytes of c
		sll %o1, 16, %o3
		cmp %o2, 32
		blu,pn %xcc, .wdalign
		or %o1, %o3, %o1 ! now o1 has 4 bytes of c

		sllx %o1, 32, %o3
		or %o1, %o3, %o1 ! now o1 has 8 bytes of c

		.dbalign:
		andcc %o5, 7, %o3 ! is sp1 aligned on a 8 byte bound?
		bz,pt %xcc, .blkalign ! already long word aligned
		sub %o3, 8, %o3 ! -(bytes till long word aligned)

		add %o2, %o3, %o2 ! update o2 with new count
		! Set -(%o3) bytes till sp1 long word aligned
		1: stb %o1, [%o5] ! there is at least 1 byte to set
		inccc %o3 ! byte clearing loop
		bl,pt %xcc, 1b
		inc %o5

		! Now sp1 is long word aligned (sp1 is found in %o5)
		.blkalign:
		cmp %o2, 64 ! check if there are 64 bytes to set
		blu,pn %xcc, .wrshort
		mov %o2, %o3

		andcc %o5, 63, %o3 ! is sp1 block aligned?
		bz,pt %xcc, .blkwr ! now block aligned
		sub %o3, 64, %o3 ! o3 is -(bytes till block aligned)
		add %o2, %o3, %o2 ! o2 is the remainder

		! Store -(%o3) bytes till dst is block (64 byte) aligned.
		! Use long word stores.
		! Recall that dst is already long word aligned
		1:
		addcc %o3, 8, %o3
		stx %o1, [%o5]
		bl,pt %xcc, 1b
		add %o5, 8, %o5

		! Now sp1 is block aligned
		.blkwr:
		andn %o2, 63, %o4 ! calculate size of blocks in bytes
		brz,pn %o1, .wrzero ! special case if c == 0
		and %o2, 63, %o3 ! %o3 = bytes left after blk stores.

		set MIN_LOOP, %g1
		cmp %o4, %g1 ! check there are enough bytes to set
		blu,pn %xcc, .short_set ! to justify cost of membar
		! must be > pre-cleared lines
		nop

		! initial cache-clearing stores
		! get store pipeline moving
		rd %asi, %g3 ! save %asi to be restored later
		wr %g0, ASI_STBIMRU_P, %asi

		! Primary memset loop for large memsets
		.wr_loop:
		sub %o5, 8, %o5 ! adjust %o5 for ASI store alignment
		mov ST_CHUNK, %g1
		.wr_loop_start:
		stxa %o1, [%o5+8]%asi
		subcc %g1, 4, %g1
		stxa %o1, [%o5+8+64]%asi
		add %o5, 256, %o5
		stxa %o1, [%o5+8-128]%asi
		bgu %xcc, .wr_loop_start
		stxa %o1, [%o5+8-64]%asi

		sub %o5, ST_CHUNK*64, %o5 ! reset %o5
		mov ST_CHUNK, %g1

		.wr_loop_rest:
		stxa %o1, [%o5+8+8]%asi
		sub %o4, 64, %o4
		stxa %o1, [%o5+16+8]%asi
		subcc %g1, 1, %g1
		stxa %o1, [%o5+24+8]%asi
		stxa %o1, [%o5+32+8]%asi
		stxa %o1, [%o5+40+8]%asi
		add %o5, 64, %o5
		stxa %o1, [%o5-8]%asi
		bgu %xcc, .wr_loop_rest
		stxa %o1, [%o5]ASI_STBI_P

		! If more than ST_CHUNK*64 bytes remain to set, continue
		! setting the first long word of each cache line in advance
		! to keep the store pipeline moving.

		cmp %o4, ST_CHUNK*64
		bge,pt %xcc, .wr_loop_start
		mov ST_CHUNK, %g1

		brz,a,pn %o4, .asi_done
		add %o5, 8, %o5 ! restore %o5 offset

		.wr_loop_small:
		stxa %o1, [%o5+8]%asi
		stxa %o1, [%o5+8+8]%asi
		stxa %o1, [%o5+16+8]%asi
		stxa %o1, [%o5+24+8]%asi
		stxa %o1, [%o5+32+8]%asi
		subcc %o4, 64, %o4
		stxa %o1, [%o5+40+8]%asi
		add %o5, 64, %o5
		stxa %o1, [%o5-8]%asi
		bgu,pt %xcc, .wr_loop_small
		stxa %o1, [%o5]ASI_STBI_P

		ba .asi_done
		add %o5, 8, %o5 ! restore %o5 offset

		! Special case loop for zero fill memsets
		! For each 64 byte cache line, single STBI to first element
		! clears line
		.wrzero:
		cmp %o4, MIN_ZERO ! check if enough bytes to set
		! to pay %asi + membar cost
		blu %xcc, .short_set
		nop
		sub %o4, 256, %o4

		.wrzero_loop:
		mov 64, %g3
		stxa %o1, [%o5]ASI_STBI_P
		subcc %o4, 256, %o4
		stxa %o1, [%o5+%g3]ASI_STBI_P
		add %o5, 256, %o5
		sub %g3, 192, %g3
		stxa %o1, [%o5+%g3]ASI_STBI_P
		add %g3, 64, %g3
		bge,pt %xcc, .wrzero_loop
		stxa %o1, [%o5+%g3]ASI_STBI_P
		add %o4, 256, %o4

		brz,pn %o4, .bsi_done
		nop

		.wrzero_small:
		stxa %o1, [%o5]ASI_STBI_P
		subcc %o4, 64, %o4
		bgu,pt %xcc, .wrzero_small
		add %o5, 64, %o5
		ba,a .bsi_done

		.asi_done:
		wr %g3, 0x0, %asi ! restored saved %asi
		.bsi_done:
		membar #StoreStore ! required by use of Block Store Init

		.short_set:
		cmp %o4, 64 ! check if 64 bytes to set
		blu %xcc, 5f
		nop
		4: ! set final blocks of 64 bytes
		stx %o1, [%o5]
		stx %o1, [%o5+8]
		stx %o1, [%o5+16]
		stx %o1, [%o5+24]
		subcc %o4, 64, %o4
		stx %o1, [%o5+32]
		stx %o1, [%o5+40]
		add %o5, 64, %o5
		stx %o1, [%o5-16]
		bgu,pt %xcc, 4b
		stx %o1, [%o5-8]

		5:
		! Set the remaining long words
		.wrshort:
		subcc %o3, 8, %o3 ! Can we store any long words?
		blu,pn %xcc, .wrchars
		and %o2, 7, %o2 ! calc bytes left after long words
		6:
		subcc %o3, 8, %o3
		stx %o1, [%o5] ! store the long words
		bgeu,pt %xcc, 6b
		add %o5, 8, %o5

		.wrchars: ! check for extra chars
		brnz %o2, .wrfin
		nop
		retl
		nop

		.wdalign:
		andcc %o5, 3, %o3 ! is sp1 aligned on a word boundary
		bz,pn %xcc, .wrword
		andn %o2, 3, %o3 ! create word sized count in %o3

		dec %o2 ! decrement count
		stb %o1, [%o5] ! clear a byte
		b .wdalign
		inc %o5 ! next byte

		.wrword:
		subcc %o3, 4, %o3
		st %o1, [%o5] ! 4-byte writing loop
		bnz,pt %xcc, .wrword
		add %o5, 4, %o5

		and %o2, 3, %o2 ! leftover count, if any

		.wrchar:
		! Set the remaining bytes, if any
		brz %o2, .exit
		nop
		.wrfin:
		deccc %o2
		stb %o1, [%o5]
		bgu,pt %xcc, .wrfin
		inc %o5
		.exit:
		retl ! %o0 was preserved
		nop

		.size M7memset,.-M7memset