Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fa5dc772 authored by David S. Miller
Browse files

Merge branch 'sparc64-M7-memcpy'

Babu Moger says:

====================
sparc64: Update memcpy, memset etc. for M7/M8 architectures

This series of patches updates the memcpy, memset, copy_to_user, copy_from_user
etc for SPARC M7/M8 architecture.

The new algorithm takes advantage of the M7/M8 block init store ASIs in a much
more optimized way to improve performance. More details are in the code comments.

Tested and compared the latency measured in ticks(NG4memcpy vs new M7memcpy).

1. Memset numbers(Aligned memset)

No.of bytes   NG4memset	   M7memset    	Delta ((B-A)/A)*100
	     (Avg.Ticks A) (Avg.Ticks B) (latency reduction)
  3		77		25		-67.53
  7		43		33		-23.25
  32		72		68		 -5.55
  128		164		44		-73.17
  256		335		68		-79.70
  512		511		220		-56.94
  1024		1552		627		-59.60
  2048		3515		1322		-62.38
  4096		6303		2472		-60.78
  8192		13118		4867		-62.89
  16384		26206		10371		-60.42
  32768		52501		18569		-64.63
  65536		100219		35899		-64.17

2. Memcpy numbers(Aligned memcpy)

No.of bytes   NG4memcpy	   M7memcpy    	Delta ((B-A)/A)*100
	     (Avg.Ticks A) (Avg.Ticks B) (latency reduction)
  3		20		19		-5
  7		29		27		-6.89
  32		30		28		-6.66
  128		89		69		-22.47
  256		142		143		 0.70
  512		341		283		-17.00
  1024		1588		655		-58.75
  2048		3553		1357		-61.80
  4096		7218		2590		-64.11
  8192		13701		5231		-61.82
  16384		28304		10716		-62.13
  32768		56516		22995		-59.31
  65536		115443		50840		-55.96

3. Memset numbers(un-aligned memset)

No.of bytes   NG4memset	   M7memset    	Delta ((B-A)/A)*100
	     (Avg.Ticks A) (Avg.Ticks B) (latency reduction)
  3		40		31		-22.5
  7		52		29		-44.2307692308
  32		89		86		-3.3707865169
  128		201		74		-63.184079602
  256		340		154		-54.7058823529
  512		961		335		-65.1404786681
  1024		1799		686		-61.8677042802
  2048		3575		1260		-64.7552447552
  4096		6560		2627		-59.9542682927
  8192		13161		6018		-54.273991338
  16384		26465		10439		-60.5554505951
  32768		52119		18649		-64.2184232238
  65536		101593		35724		-64.8361599717

4. Memcpy numbers(un-aligned memcpy)

No.of bytes   NG4memcpy	   M7memcpy    	Delta ((B-A)/A)*100
	     (Avg.Ticks A) (Avg.Ticks B) (latency reduction)
  3		26		19		-26.9230769231
  7		48		45		-6.25
  32		52		49		-5.7692307692
  128		284		334		17.6056338028
  256		430		482		12.0930232558
  512		646		690		6.8111455108
  1024		1051		1016		-3.3301617507
  2048		1787		1818		1.7347509793
  4096		3309		3376		2.0247809006
  8192		8151		7444		-8.673782358
  16384		34222		34556		0.9759803635
  32768		87851		95044		8.1877269468
  65536		158331		159572		0.7838010244

There is not much difference in the numbers for unaligned copies
between NG4memcpy and M7memcpy because they both mostly use the
same algorithms.

v2:
 1. Fixed indentation issues found by David Miller
 2. Used ENTRY and ENDPROC for the labels in M7patch.S as suggested by David Miller
 3. Now M8 also will use M7memcpy. Also tested on M8 config.
 4. These patches are created on top of below M8 patches
    https://patchwork.ozlabs.org/patch/792661/
    https://patchwork.ozlabs.org/patch/792662/


    However, I did not see these patches in the sparc-next tree; they may be in
    the queue now. Until they land, these patches might cause some build
    problems, which will be resolved once all M8 patches are in the sparc-next tree.

v0: Initial version
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 061273f9 34060b8f
Loading
Loading
Loading
Loading
+14 −2
Original line number Diff line number Diff line
@@ -603,10 +603,10 @@ niagara_tlb_fixup:
	be,pt	%xcc, niagara4_patch
	 nop
	cmp	%g1, SUN4V_CHIP_SPARC_M7
	be,pt	%xcc, niagara4_patch
	be,pt	%xcc, sparc_m7_patch
	 nop
	cmp	%g1, SUN4V_CHIP_SPARC_M8
	be,pt	%xcc, niagara4_patch
	be,pt	%xcc, sparc_m7_patch
	 nop
	cmp	%g1, SUN4V_CHIP_SPARC_SN
	be,pt	%xcc, niagara4_patch
@@ -621,6 +621,18 @@ niagara_tlb_fixup:

	ba,a,pt	%xcc, 80f
	 nop

/*
 * sparc_m7_patch: target of the SUN4V_CHIP_SPARC_M7 and
 * SUN4V_CHIP_SPARC_M8 compares in niagara_tlb_fixup.
 *
 * Calls the three M7 patch routines in turn (copy ops, bzero, page
 * ops) and then branches to local label 80, like the other *_patch
 * blocks next to it.  Each nop after a call fills that call's delay
 * slot.
 */
sparc_m7_patch:
	call	m7_patch_copyops
	 nop
	call	m7_patch_bzero
	 nop
	call	m7_patch_pageops
	 nop

	/* Annulled unconditional branch: the nop below is not executed. */
	ba,a,pt	%xcc, 80f
	 nop

niagara4_patch:
	call	niagara4_patch_copyops
	 nop
+40 −0
Original line number Diff line number Diff line
/*
 * M7copy_from_user.S: SPARC M7 optimized copy from userspace.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains no instructions of its own.  It defines the
 * macros below and then includes M7memcpy.S, which is expected to
 * use them to build the routine named by FUNC_NAME.
 */


/*
 * EX_LD(x, y): emit instruction x at local label 98 and add an
 * __ex_table entry made of two words: the address of x and y.
 * y is presumably the fixup target used when x faults -- confirm
 * against M7memcpy.S.  Assembly then resumes in .text, 4-byte aligned.
 */
#define EX_LD(x, y)			\
98:	x;				\
	.section __ex_table,"a";	\
	.align 4;			\
	.word 98b, y;			\
	.text;				\
	.align 4;

/*
 * EX_LD_FP(x, y): same as EX_LD, except that the second table word
 * is the symbol formed by pasting "_fp" onto y.
 */
#define EX_LD_FP(x, y)			\
98:	x;				\
	.section __ex_table,"a";	\
	.align 4;			\
	.word 98b, y##_fp;		\
	.text;				\
	.align 4;

/* Fallback value, used only if no earlier header defined ASI_AIUS. */
#ifndef ASI_AIUS
#define ASI_AIUS	0x11
#endif

/* Name of the routine to be generated. */
#define FUNC_NAME		M7copy_from_user
/*
 * LOAD pastes "a" onto the load mnemonic, so every load becomes the
 * alternate-space form that takes its ASI from the %asi register.
 */
#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
/* EX_RETVAL ignores its argument and always expands to 0. */
#define EX_RETVAL(x)		0

#ifdef __KERNEL__
/*
 * PREAMBLE: if %asi does not hold ASI_AIUS, branch to
 * raw_copy_in_user instead.  Clobbers %g1 and the integer condition
 * codes; the nop fills the branch delay slot.
 */
#define PREAMBLE					\
	rd		%asi, %g1;			\
	cmp		%g1, ASI_AIUS;			\
	bne,pn		%icc, raw_copy_in_user;		\
	 nop
#endif

#include "M7memcpy.S"
+51 −0
Original line number Diff line number Diff line
/*
 * M7copy_to_user.S: SPARC M7 optimized copy to userspace.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains no instructions of its own.  It defines the
 * macros below and then includes M7memcpy.S, which is expected to
 * use them to build the routine named by FUNC_NAME.
 */


/*
 * EX_ST(x, y): emit instruction x at local label 98 and add an
 * __ex_table entry made of two words: the address of x and y.
 * y is presumably the fixup target used when x faults -- confirm
 * against M7memcpy.S.  Assembly then resumes in .text, 4-byte aligned.
 */
#define EX_ST(x, y)			\
98:	x;				\
	.section __ex_table,"a";	\
	.align 4;			\
	.word 98b, y;			\
	.text;				\
	.align 4;

/*
 * EX_ST_FP(x, y): same as EX_ST, except that the second table word
 * is the symbol formed by pasting "_fp" onto y.
 */
#define EX_ST_FP(x, y)			\
98:	x;				\
	.section __ex_table,"a";	\
	.align 4;			\
	.word 98b, y##_fp;		\
	.text;				\
	.align 4;


/* Fallback values, used only if no earlier header defined them. */
#ifndef ASI_AIUS
#define ASI_AIUS	0x11
#endif

#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
#endif

/* Name of the routine to be generated. */
#define FUNC_NAME		M7copy_to_user
/*
 * STORE pastes "a" onto the store mnemonic, so every store becomes
 * the alternate-space form that takes its ASI from the %asi register.
 */
#define STORE(type,src,addr)	type##a src, [addr] %asi
#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
/*
 * ASI_ST_BLKINIT_MRU_S has no local fallback in this file; it has to
 * be defined by a header that is in scope where STORE_MRU_ASI is
 * expanded.
 */
#define	STORE_MRU_ASI		ASI_ST_BLKINIT_MRU_S
/* EX_RETVAL ignores its argument and always expands to 0. */
#define EX_RETVAL(x)		0

#ifdef __KERNEL__
	/* Writing to %asi is _expensive_ so we hardcode it.
	 * Reading %asi to check for KERNEL_DS is comparatively
	 * cheap.
	 *
	 * PREAMBLE: if %asi does not hold ASI_AIUS, branch to
	 * raw_copy_in_user instead.  Clobbers %g1 and the integer
	 * condition codes; the nop fills the branch delay slot.
	 */
#define PREAMBLE					\
	rd		%asi, %g1;			\
	cmp		%g1, ASI_AIUS;			\
	bne,pn		%icc, raw_copy_in_user;		\
	 nop
#endif

#include "M7memcpy.S"
+923 −0

File added.

Preview size limit exceeded, changes collapsed.

+352 −0
Original line number Diff line number Diff line
/*
 * M7memset.S: SPARC M7 optimized memset.
 *
 * Copyright (c) 2016, Oracle and/or its affiliates.  All rights reserved.
 */

/*
 * M7memset.S: M7 optimized memset.
 *
 * char *memset(sp, c, n)
 *
 * Set an array of n chars starting at sp to the character c.
 * Return sp.
 *
 * Fast assembler language version of the following C-program for memset
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memset(void *sp1, int c, size_t n)
 *	{
 *	    if (n != 0) {
 *		char *sp = sp1;
 *		do {
 *		    *sp++ = (char)c;
 *		} while (--n != 0);
 *	    }
 *	    return (sp1);
 *	}
 *
 * The algorithm is as follows :
 *
 *	For small stores of 7 or fewer bytes, bytes will be stored.
 *
 *	For stores of less than 32 bytes, align the address on 4 byte boundary.
 *	Then store as many 4-byte chunks, followed by trailing bytes.
 *
 *	For sizes of 32 bytes or more, align the address on 8 byte boundary.
 *	if (count >= 64) {
 *      	store 8-bytes chunks to align the address on 64 byte boundary
 *		if (value to be set is zero && count >= MIN_ZERO) {
 *              	Using BIS stores, set the first long word of each
 *			64-byte cache line to zero which will also clear the
 *			other seven long words of the cache line.
 *       	}
 *       	else if (count >= MIN_LOOP) {
 *       		Using BIS stores, set the first long word of each of
 *              	ST_CHUNK cache lines (64 bytes each) before the main
 *			loop is entered.
 *              	In the main loop, continue pre-setting the first long
 *              	word of each cache line ST_CHUNK lines in advance while
 *              	setting the other seven long words (56 bytes) of each
 * 			cache line until fewer than ST_CHUNK*64 bytes remain.
 *			Then set the remaining seven long words of each cache
 * 			line that has already had its first long word set.
 *       	}
 *       	store remaining data in 64-byte chunks until less than
 *       	64 bytes remain.
 *       }
 *       Store as many 8-byte chunks, followed by trailing bytes.
 *
 * BIS = Block Init Store
 *   Doing the advance store of the first element of the cache line
 *   initiates the displacement of a cache line while only using a single
 *   instruction in the pipeline. That avoids various pipeline delays,
 *   such as filling the miss buffer. The performance effect is
 *   similar to prefetching for normal stores.
 *   The special case for zero fills runs faster and uses fewer instruction
 *   cycles than the normal memset loop.
 *
 * We only use BIS when at least MIN_LOOP bytes of whole cache lines are to be
 * set (MIN_ZERO bytes for zero fills) because a sequence of BIS stores must be
 * followed by a membar #StoreStore. The benefit of the BIS store must be
 * balanced against the cost of the membar operation.
 */

/*
 * ASI_STBI_P marks the cache line as "least recently used"
 * which means if many threads are active, it has a high chance
 * of being pushed out of the cache between the first initializing
 * store and the final stores.
 * Thus, we use ASI_STBIMRU_P which marks the cache line as
 * "most recently used" for all but the last store to the cache line.
 */

#include <asm/asi.h>
#include <asm/page.h>

#define ASI_STBI_P      ASI_BLK_INIT_QUAD_LDD_P
#define ASI_STBIMRU_P   ASI_ST_BLKINIT_MRU_P


#define ST_CHUNK        24   /* multiple of 4 due to loop unrolling */
#define MIN_LOOP        16320
#define MIN_ZERO        512

	.section	".text"
	.align		32

/*
 * Define clear_page(dest) as memset(dest, 0, PAGE_SIZE)
 * (can create a more optimized version later.)
 *
 * In:	%o0 = dest
 *
 * M7clear_page and M7clear_user_page are two names for the same
 * address.  The only work done here is loading PAGE_SIZE into %o1,
 * which is where M7bzero expects its size argument; execution then
 * runs straight on into M7bzero, because only assembler directives
 * lie in between.
 */
	.globl		M7clear_page
	.globl		M7clear_user_page
M7clear_page:		/* clear_page(dest) */
M7clear_user_page:
	set	PAGE_SIZE, %o1
	/* fall through into bzero code */

	/* The recorded symbol sizes cover only the set above. */
	.size		M7clear_page,.-M7clear_page
	.size		M7clear_user_page,.-M7clear_user_page

/*
 * Define bzero(dest, n) as memset(dest, 0, n)
 * (can create a more optimized version later.)
 *
 * In:	%o0 = dest, %o1 = size
 *
 * Rearranges the arguments into the memset layout (%o1 = fill value,
 * %o2 = count) and runs straight on into M7memset, because only
 * assembler directives lie in between.
 */
	.globl		M7bzero
M7bzero:		/* bzero(dest, size) */
	mov	%o1, %o2		/* count = size */
	mov	0, %o1			/* fill value = 0 */
	/* fall through into memset code */

	/* The recorded symbol size covers only the two moves above. */
	.size		M7bzero,.-M7bzero

	/*
	 * M7memset(sp1, c, n): store (char)c into n bytes starting at sp1.
	 *
	 * Also entered by fall-through from M7bzero (c = 0) and, through
	 * it, from M7clear_page/M7clear_user_page (c = 0, n = PAGE_SIZE).
	 *
	 * Register usage:
	 *	%o0	sp1; never written in this routine, so it still
	 *		holds the original pointer at every retl
	 *	%o1	fill value, widened to 2, 4 and then 8 copies of c
	 *	%o2	byte count still to be accounted for
	 *	%o3	scratch: negative alignment counts, later the
	 *		bytes left for the long word tail loop
	 *	%o4	bytes to be written as whole 64-byte blocks
	 *	%o5	current store address
	 *	%g1	MIN_LOOP threshold, then cache line loop counter
	 *	%g3	saved %asi on the non-zero BIS path; index
	 *		register on the zero fill path
	 *
	 * The instruction after each branch or retl is in its delay
	 * slot; those lines carry one extra leading space.  Unless the
	 * branch has the annul bit (",a"), the delay slot instruction
	 * runs whether or not the branch is taken.
	 */
	.global		M7memset
	.type		M7memset, #function
	.register	%g3, #scratch
M7memset:
	mov     %o0, %o5                ! copy sp1 before using it
	cmp     %o2, 7                  ! if small counts, just write bytes
	bleu,pn %xcc, .wrchar
	 and     %o1, 0xff, %o1          ! o1 is (char)c

	sll     %o1, 8, %o3
	or      %o1, %o3, %o1           ! now o1 has 2 bytes of c
	sll     %o1, 16, %o3
	cmp     %o2, 32
	blu,pn  %xcc, .wdalign
	 or      %o1, %o3, %o1           ! now o1 has 4 bytes of c

	sllx    %o1, 32, %o3
	or      %o1, %o3, %o1           ! now o1 has 8 bytes of c

	! Only counts of 32 or more reach this point.
.dbalign:
	andcc   %o5, 7, %o3             ! is sp1 aligned on a 8 byte bound?
	bz,pt   %xcc, .blkalign         ! already long word aligned
	 sub     %o3, 8, %o3             ! -(bytes till long word aligned)

	add     %o2, %o3, %o2           ! update o2 with new count
	! Set -(%o3) bytes till sp1 long word aligned
1:	stb     %o1, [%o5]              ! there is at least 1 byte to set
	inccc   %o3                     ! byte clearing loop
	bl,pt   %xcc, 1b
	 inc     %o5

	! Now sp1 is long word aligned (sp1 is found in %o5)
.blkalign:
	cmp     %o2, 64                 ! check if there are 64 bytes to set
	blu,pn  %xcc, .wrshort
	 mov     %o2, %o3                ! %o3 = byte count used by .wrshort

	andcc   %o5, 63, %o3            ! is sp1 block aligned?
	bz,pt   %xcc, .blkwr            ! now block aligned
	 sub     %o3, 64, %o3            ! o3 is -(bytes till block aligned)
	add     %o2, %o3, %o2           ! o2 is the remainder

	! Store -(%o3) bytes till dst is block (64 byte) aligned.
	! Use long word stores.
	! Recall that dst is already long word aligned
1:
	addcc   %o3, 8, %o3
	stx     %o1, [%o5]
	bl,pt   %xcc, 1b
	 add     %o5, 8, %o5

	! Now sp1 is block aligned
.blkwr:
	andn    %o2, 63, %o4            ! calculate size of blocks in bytes
	brz,pn  %o1, .wrzero            ! special case if c == 0
	 and     %o2, 63, %o3            ! %o3 = bytes left after blk stores.

	set     MIN_LOOP, %g1
	cmp     %o4, %g1                ! check there are enough bytes to set
	blu,pn  %xcc, .short_set        ! to justify cost of membar
	                                ! must be > pre-cleared lines
	 nop

	! initial cache-clearing stores
	! get store pipeline moving
	rd	%asi, %g3		! save %asi to be restored later
	wr     %g0, ASI_STBIMRU_P, %asi

	! Primary memset loop for large memsets
.wr_loop:
	sub     %o5, 8, %o5		! adjust %o5 for ASI store alignment
	mov     ST_CHUNK, %g1
	! Store the first long word of ST_CHUNK consecutive cache lines,
	! four lines per iteration.  With %o5 biased by -8, [%o5+8] is
	! the start of a line.
.wr_loop_start:
	stxa    %o1, [%o5+8]%asi
	subcc   %g1, 4, %g1
	stxa    %o1, [%o5+8+64]%asi
	add     %o5, 256, %o5
	stxa    %o1, [%o5+8-128]%asi
	bgu     %xcc, .wr_loop_start
	 stxa    %o1, [%o5+8-64]%asi

	sub     %o5, ST_CHUNK*64, %o5	! reset %o5
	mov     ST_CHUNK, %g1

	! Store the other seven long words of each of those ST_CHUNK
	! lines, one line per iteration.  The last long word of a line
	! is written through ASI_STBI_P, the others through %asi.
.wr_loop_rest:
	stxa    %o1, [%o5+8+8]%asi
	sub     %o4, 64, %o4
	stxa    %o1, [%o5+16+8]%asi
	subcc   %g1, 1, %g1
	stxa    %o1, [%o5+24+8]%asi
	stxa    %o1, [%o5+32+8]%asi
	stxa    %o1, [%o5+40+8]%asi
	add     %o5, 64, %o5
	stxa    %o1, [%o5-8]%asi
	bgu     %xcc, .wr_loop_rest
	 stxa    %o1, [%o5]ASI_STBI_P

	! If more than ST_CHUNK*64 bytes remain to set, continue
	! setting the first long word of each cache line in advance
	! to keep the store pipeline moving.

	cmp     %o4, ST_CHUNK*64
	bge,pt  %xcc, .wr_loop_start
	 mov     ST_CHUNK, %g1

	brz,a,pn %o4, .asi_done
	 add     %o5, 8, %o5             ! restore %o5 offset

	! Fewer than ST_CHUNK lines remain: write each of them in full,
	! one 64-byte line per iteration, until %o4 reaches zero.
.wr_loop_small:
	stxa    %o1, [%o5+8]%asi
	stxa    %o1, [%o5+8+8]%asi
	stxa    %o1, [%o5+16+8]%asi
	stxa    %o1, [%o5+24+8]%asi
	stxa    %o1, [%o5+32+8]%asi
	subcc   %o4, 64, %o4
	stxa    %o1, [%o5+40+8]%asi
	add     %o5, 64, %o5
	stxa    %o1, [%o5-8]%asi
	bgu,pt  %xcc, .wr_loop_small
	 stxa    %o1, [%o5]ASI_STBI_P

	ba      .asi_done
	 add     %o5, 8, %o5             ! restore %o5 offset

	! Special case loop for zero fill memsets
	! For each 64 byte cache line, single STBI to first element
	! clears line
.wrzero:
	cmp     %o4, MIN_ZERO           ! check if enough bytes to set
					! to pay %asi + membar cost
	blu     %xcc, .short_set
	 nop
	sub     %o4, 256, %o4           ! bias count; undone after the loop

	! Four cache lines per iteration, at offsets 0, 64, 128 and 192
	! from the value %o5 has at the top of the iteration.
.wrzero_loop:
	mov     64, %g3
	stxa    %o1, [%o5]ASI_STBI_P
	subcc   %o4, 256, %o4
	stxa    %o1, [%o5+%g3]ASI_STBI_P
	add     %o5, 256, %o5
	sub     %g3, 192, %g3
	stxa    %o1, [%o5+%g3]ASI_STBI_P
	add %g3, 64, %g3
	bge,pt  %xcc, .wrzero_loop
	 stxa    %o1, [%o5+%g3]ASI_STBI_P
	add     %o4, 256, %o4

	brz,pn  %o4, .bsi_done
	 nop

	! One cache line per iteration for the lines that are left.
.wrzero_small:
	stxa    %o1, [%o5]ASI_STBI_P
	subcc   %o4, 64, %o4
	bgu,pt  %xcc, .wrzero_small
	 add     %o5, 64, %o5
	ba,a	.bsi_done

	! .asi_done is used only by the non-zero path, the one that
	! saved %asi in %g3.  The zero fill path reuses %g3 as an index
	! and therefore joins at .bsi_done.
.asi_done:
	wr	%g3, 0x0, %asi		! restored saved %asi
.bsi_done:
	membar  #StoreStore             ! required by use of Block Store Init

	! Blocks too small for BIS are written here with plain stores.
	! The BIS paths above fall in with %o4 == 0 and skip to 5:.
.short_set:
	cmp     %o4, 64                 ! check if 64 bytes to set
	blu     %xcc, 5f
	 nop
4:                                      ! set final blocks of 64 bytes
	stx     %o1, [%o5]
	stx     %o1, [%o5+8]
	stx     %o1, [%o5+16]
	stx     %o1, [%o5+24]
	subcc   %o4, 64, %o4
	stx     %o1, [%o5+32]
	stx     %o1, [%o5+40]
	add     %o5, 64, %o5
	stx     %o1, [%o5-16]
	bgu,pt  %xcc, 4b
	 stx     %o1, [%o5-8]

5:
	! Set the remaining long words
.wrshort:
	subcc   %o3, 8, %o3             ! Can we store any long words?
	blu,pn  %xcc, .wrchars
	 and     %o2, 7, %o2             ! calc bytes left after long words
6:
	subcc   %o3, 8, %o3
	stx     %o1, [%o5]              ! store the long words
	bgeu,pt %xcc, 6b
	 add     %o5, 8, %o5

.wrchars:                               ! check for extra chars
	brnz    %o2, .wrfin
	 nop
	retl
	 nop

	! Counts from 8 to 31 arrive here with %o1 holding 4 bytes of c.
	! Bytes are stored one at a time until %o5 is word aligned.
.wdalign:
	andcc   %o5, 3, %o3             ! is sp1 aligned on a word boundary
	bz,pn   %xcc, .wrword
	 andn    %o2, 3, %o3             ! create word sized count in %o3

	dec     %o2                     ! decrement count
	stb     %o1, [%o5]              ! clear a byte
	b       .wdalign
	 inc     %o5                     ! next byte

	! %o3 is a non-zero multiple of 4 here: at most 3 of the 8 or
	! more bytes were used for alignment, so the bnz test ends the
	! loop exactly when %o3 reaches zero.
.wrword:
	subcc   %o3, 4, %o3
	st      %o1, [%o5]              ! 4-byte writing loop
	bnz,pt  %xcc, .wrword
	 add     %o5, 4, %o5

	and     %o2, 3, %o2             ! leftover count, if any

.wrchar:
	! Set the remaining bytes, if any
	brz     %o2, .exit
	 nop
	! Byte loop; entered with %o2 != 0 from .wrchar and .wrchars.
.wrfin:
	deccc   %o2
	stb     %o1, [%o5]
	bgu,pt  %xcc, .wrfin
	 inc     %o5
.exit:
	retl                            ! %o0 was preserved
	 nop

	.size		M7memset,.-M7memset
Loading