
Commit ad2fc2cd authored by Vitaly Mayatskikh, committed by Ingo Molnar

x86: fix copy_user on x86



Switch copy_user_generic_string(), copy_user_generic_unrolled() and
__copy_user_nocache() from custom tail handlers to the generic
copy_user_handle_tail().

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
parent 1129585a
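
The generic tail handler that all three routines now share, copy_user_handle_tail(), is a C function introduced by the parent commit (1129585a). The sketch below is a minimal illustration of the contract the fixup stubs in this diff rely on: copy the leftover bytes one at a time until an access faults, optionally zero whatever could not be copied, and return the number of bytes left. The read_byte()/write_byte() helpers are hypothetical stand-ins for the kernel's fault-safe __get_user()/__put_user() accessors; this is not the actual kernel implementation.

/* Hypothetical stand-ins for the kernel's fault-safe byte accessors:
 * here they always succeed, in the kernel they return non-zero when the
 * access faults. */
static int read_byte(char *dst, const char *src)  { *dst = *src; return 0; }
static int write_byte(char val, char *dst)        { *dst = val;  return 0; }

/* Rough illustration of a generic tail handler: copy the remaining bytes
 * one by one until an access fails, optionally zero the part that could
 * not be copied, and report how many bytes are left. */
static unsigned long handle_tail_sketch(char *to, const char *from,
					unsigned long len, int zerorest)
{
	while (len) {
		char c;

		if (read_byte(&c, from++) || write_byte(c, to++))
			break;
		len--;
	}
	if (zerorest) {
		unsigned long n = len;

		while (n && !write_byte(0, to++))	/* stop if zeroing faults too */
			n--;
	}
	return len;	/* bytes that could not be copied; 0 on full success */
}

Centralising the slow path in one C routine is what lets the custom per-function fixup and zeroing loops in the old assembly below be deleted.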
arch/x86/lib/copy_user_64.S: +172 −255
/* Copyright 2002 Andi Kleen, SuSE Labs.
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 * Subject to the GNU Public License v2.
 *
 *
 * Functions to copy from and to user space.
 * Functions to copy from and to user space.
@@ -33,6 +35,35 @@
	.previous
	.previous
	.endm
	.endm


	.macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jz 102f				/* already aligned */
	subl $8,%ecx
	negl %ecx
	subl %ecx,%edx
100:	movb (%rsi),%al
101:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 100b
102:
	.section .fixup,"ax"
103:	addl %r8d,%edx			/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	.section __ex_table,"a"
	.align 8
	.quad 100b,103b
	.quad 101b,103b
	.previous
#endif
	.endm

/* Standard copy_to_user with segment limit checking */
/* Standard copy_to_user with segment limit checking */
ENTRY(copy_to_user)
ENTRY(copy_to_user)
	CFI_STARTPROC
	CFI_STARTPROC
@@ -42,19 +73,6 @@ ENTRY(copy_to_user)
	jc bad_to_user
	jc bad_to_user
	cmpq TI_addr_limit(%rax),%rcx
	cmpq TI_addr_limit(%rax),%rcx
	jae bad_to_user
	jae bad_to_user
	xorl %eax,%eax	/* clear zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC

ENTRY(copy_user_generic)
	CFI_STARTPROC
	movl $1,%ecx	/* set zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC

ENTRY(__copy_from_user_inatomic)
	CFI_STARTPROC
	xorl %ecx,%ecx	/* clear zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC
	CFI_ENDPROC


@@ -67,13 +85,25 @@ ENTRY(copy_from_user)
	jc bad_from_user
	jc bad_from_user
	cmpq TI_addr_limit(%rax),%rcx
	cmpq TI_addr_limit(%rax),%rcx
	jae bad_from_user
	jae bad_from_user
	movl $1,%ecx	/* set zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC
	CFI_ENDPROC
ENDPROC(copy_from_user)
ENDPROC(copy_from_user)


ENTRY(copy_user_generic)
	CFI_STARTPROC
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC
ENDPROC(copy_user_generic)

ENTRY(__copy_from_user_inatomic)
	CFI_STARTPROC
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC
ENDPROC(__copy_from_user_inatomic)

	.section .fixup,"ax"
	.section .fixup,"ax"
	/* must zero dest */
	/* must zero dest */
ENTRY(bad_from_user)
bad_from_user:
bad_from_user:
	CFI_STARTPROC
	CFI_STARTPROC
	movl %edx,%ecx
	movl %edx,%ecx
@@ -84,219 +114,111 @@ bad_to_user:
	movl %edx,%eax
	movl %edx,%eax
	ret
	ret
	CFI_ENDPROC
	CFI_ENDPROC
END(bad_from_user)
ENDPROC(bad_from_user)
	.previous
	.previous


		
/*
/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
 * This version is for CPUs like P4 that don't have efficient micro
 * code for rep movsq
 *
 *
 * Input:
 * Input:
 * rdi destination
 * rdi destination
 * rsi source
 * rsi source
 * rdx count
 * rdx count
 * ecx zero flag -- if true zero destination on error
 *
 *
 * Output:
 * Output:
 * eax uncopied bytes or 0 if successful.
 * eax uncopied bytes or 0 if successfull.
 */
 */
ENTRY(copy_user_generic_unrolled)
ENTRY(copy_user_generic_unrolled)
	CFI_STARTPROC
	CFI_STARTPROC
	pushq %rbx
	cmpl $8,%edx
	CFI_ADJUST_CFA_OFFSET 8
	jb 20f		/* less then 8 bytes, go to byte copy loop */
	CFI_REL_OFFSET rbx, 0
	ALIGN_DESTINATION
	pushq %rcx
	movl %edx,%ecx
	CFI_ADJUST_CFA_OFFSET 8
	andl $63,%edx
	CFI_REL_OFFSET rcx, 0
	shrl $6,%ecx
	xorl %eax,%eax		/*zero for the exception handler */
	jz 17f

1:	movq (%rsi),%r8
#ifdef FIX_ALIGNMENT
2:	movq 1*8(%rsi),%r9
	/* check for bad alignment of destination */
3:	movq 2*8(%rsi),%r10
	movl %edi,%ecx
4:	movq 3*8(%rsi),%r11
	andl $7,%ecx
5:	movq %r8,(%rdi)
	jnz  .Lbad_alignment
6:	movq %r9,1*8(%rdi)
.Lafter_bad_alignment:
7:	movq %r10,2*8(%rdi)
#endif
8:	movq %r11,3*8(%rdi)

9:	movq 4*8(%rsi),%r8
	movq %rdx,%rcx
10:	movq 5*8(%rsi),%r9

11:	movq 6*8(%rsi),%r10
	movl $64,%ebx
12:	movq 7*8(%rsi),%r11
	shrq $6,%rdx
13:	movq %r8,4*8(%rdi)
	decq %rdx
14:	movq %r9,5*8(%rdi)
	js   .Lhandle_tail
15:	movq %r10,6*8(%rdi)

16:	movq %r11,7*8(%rdi)
	.p2align 4
.Lloop:
.Ls1:	movq (%rsi),%r11
.Ls2:	movq 1*8(%rsi),%r8
.Ls3:	movq 2*8(%rsi),%r9
.Ls4:	movq 3*8(%rsi),%r10
.Ld1:	movq %r11,(%rdi)
.Ld2:	movq %r8,1*8(%rdi)
.Ld3:	movq %r9,2*8(%rdi)
.Ld4:	movq %r10,3*8(%rdi)

.Ls5:	movq 4*8(%rsi),%r11
.Ls6:	movq 5*8(%rsi),%r8
.Ls7:	movq 6*8(%rsi),%r9
.Ls8:	movq 7*8(%rsi),%r10
.Ld5:	movq %r11,4*8(%rdi)
.Ld6:	movq %r8,5*8(%rdi)
.Ld7:	movq %r9,6*8(%rdi)
.Ld8:	movq %r10,7*8(%rdi)

	decq %rdx

	leaq 64(%rsi),%rsi
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	leaq 64(%rdi),%rdi

	jns  .Lloop

	.p2align 4
.Lhandle_tail:
	movl %ecx,%edx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	movl $8,%ebx
	.p2align 4
.Lloop_8:
.Ls9:	movq (%rsi),%r8
.Ld9:	movq %r8,(%rdi)
	decl %ecx
	decl %ecx
	leaq 8(%rdi),%rdi
	jnz 1b
17:	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rsi),%rsi
	jnz .Lloop_8
	leaq 8(%rdi),%rdi

	decl %ecx
.Lhandle_7:
	jnz 18b
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
	movl %edx,%ecx
	andl $7,%ecx
21:	movb (%rsi),%al
	jz   .Lende
22:	movb %al,(%rdi)
	.p2align 4
.Lloop_1:
.Ls10:	movb (%rsi),%bl
.Ld10:	movb %bl,(%rdi)
	incq %rdi
	incq %rsi
	incq %rsi
	incq %rdi
	decl %ecx
	decl %ecx
	jnz .Lloop_1
	jnz 21b

23:	xor %eax,%eax
	CFI_REMEMBER_STATE
.Lende:
	popq %rcx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE rcx
	popq %rbx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE rbx
	ret
	ret
	CFI_RESTORE_STATE


#ifdef FIX_ALIGNMENT
	.section .fixup,"ax"
	/* align destination */
30:	shll $6,%ecx
	.p2align 4
	addl %ecx,%edx
.Lbad_alignment:
	jmp 60f
	movl $8,%r9d
40:	leal (%edx,%ecx,8),%edx
	subl %ecx,%r9d
	jmp 60f
	movl %r9d,%ecx
50:	movl %ecx,%edx
	cmpq %r9,%rdx
60:	jmp copy_user_handle_tail /* ecx is zerorest also */
	jz   .Lhandle_7
	.previous
	js   .Lhandle_7
.Lalign_1:
.Ls11:	movb (%rsi),%bl
.Ld11:	movb %bl,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .Lalign_1
	subq %r9,%rdx
	jmp .Lafter_bad_alignment
#endif


	/* table sorted by exception address */
	.section __ex_table,"a"
	.section __ex_table,"a"
	.align 8
	.align 8
	.quad .Ls1,.Ls1e	/* Ls1-Ls4 have copied zero bytes */
	.quad 1b,30b
	.quad .Ls2,.Ls1e
	.quad 2b,30b
	.quad .Ls3,.Ls1e
	.quad 3b,30b
	.quad .Ls4,.Ls1e
	.quad 4b,30b
	.quad .Ld1,.Ls1e	/* Ld1-Ld4 have copied 0-24 bytes */
	.quad 5b,30b
	.quad .Ld2,.Ls2e
	.quad 6b,30b
	.quad .Ld3,.Ls3e
	.quad 7b,30b
	.quad .Ld4,.Ls4e
	.quad 8b,30b
	.quad .Ls5,.Ls5e	/* Ls5-Ls8 have copied 32 bytes */
	.quad 9b,30b
	.quad .Ls6,.Ls5e
	.quad 10b,30b
	.quad .Ls7,.Ls5e
	.quad 11b,30b
	.quad .Ls8,.Ls5e
	.quad 12b,30b
	.quad .Ld5,.Ls5e	/* Ld5-Ld8 have copied 32-56 bytes */
	.quad 13b,30b
	.quad .Ld6,.Ls6e
	.quad 14b,30b
	.quad .Ld7,.Ls7e
	.quad 15b,30b
	.quad .Ld8,.Ls8e
	.quad 16b,30b
	.quad .Ls9,.Le_quad
	.quad 18b,40b
	.quad .Ld9,.Le_quad
	.quad 19b,40b
	.quad .Ls10,.Le_byte
	.quad 21b,50b
	.quad .Ld10,.Le_byte
	.quad 22b,50b
#ifdef FIX_ALIGNMENT
	.quad .Ls11,.Lzero_rest
	.quad .Ld11,.Lzero_rest
#endif
	.quad .Le5,.Le_zero
	.previous
	.previous

	/* eax: zero, ebx: 64 */
.Ls1e: 	addl $8,%eax		/* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
.Ls2e: 	addl $8,%eax
.Ls3e: 	addl $8,%eax
.Ls4e: 	addl $8,%eax
.Ls5e: 	addl $8,%eax
.Ls6e: 	addl $8,%eax
.Ls7e: 	addl $8,%eax
.Ls8e: 	addl $8,%eax
	addq %rbx,%rdi	/* +64 */
	subq %rax,%rdi  /* correct destination with computed offset */

	shlq $6,%rdx	/* loop counter * 64 (stride length) */
	addq %rax,%rdx	/* add offset to loopcnt */
	andl $63,%ecx	/* remaining bytes */
	addq %rcx,%rdx	/* add them */
	jmp .Lzero_rest

	/* exception on quad word loop in tail handling */
	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
	shll $3,%ecx
	andl $7,%edx
	addl %ecx,%edx
	/* edx: bytes to zero, rdi: dest, eax:zero */
.Lzero_rest:
	cmpl $0,(%rsp)
	jz   .Le_zero
	movq %rdx,%rcx
.Le_byte:
	xorl %eax,%eax
.Le5:	rep
	stosb
	/* when there is another exception while zeroing the rest just return */
.Le_zero:
	movq %rdx,%rax
	jmp .Lende
	CFI_ENDPROC
	CFI_ENDPROC
ENDPROC(copy_user_generic)
ENDPROC(copy_user_generic_unrolled)



/* Some CPUs run faster using the string copy instructions.
/* Some CPUs run faster using the string copy instructions.
	   This is also a lot simpler. Use them when possible.
 * This is also a lot simpler. Use them when possible.
	   Patch in jmps to this code instead of copying it fully
	   to avoid unwanted aliasing in the exception tables. */

 /* rdi	destination
  * rsi source
  * rdx count
  * ecx zero flag
  *
  * Output:
  * eax uncopied bytes or 0 if successfull.
 *
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * because the kernel normally only writes from/to page sized chunks
@@ -304,48 +226,43 @@ ENDPROC(copy_user_generic)
 * And more would be dangerous because both Intel and AMD have
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this please consider this.
 * this please consider this.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
 */
ENTRY(copy_user_generic_string)
ENTRY(copy_user_generic_string)
	CFI_STARTPROC
	CFI_STARTPROC
	movl %ecx,%r8d		/* save zero flag */
	andl %edx,%edx
	jz 4f
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	movl %edx,%ecx
	shrl $3,%ecx
	shrl $3,%ecx
	andl $7,%edx
	andl $7,%edx
	jz   10f
1:	rep
1:	rep
	movsq
	movsq
	movl %edx,%ecx
2:	movl %edx,%ecx
2:	rep
3:	rep
	movsb
	movsb
9:	movl %ecx,%eax
4:	xorl %eax,%eax
	ret

	/* multiple of 8 byte */
10:	rep
	movsq
	xor %eax,%eax
	ret
	ret


	/* exception handling */
	.section .fixup,"ax"
3:      lea (%rdx,%rcx,8),%rax	/* exception on quad loop */
11:	leal (%edx,%ecx,8),%ecx
	jmp 6f
12:	movl %ecx,%edx		/* ecx is zerorest also */
5:	movl %ecx,%eax		/* exception on byte loop */
	jmp copy_user_handle_tail
	/* eax: left over bytes */
	.previous
6:	testl %r8d,%r8d		/* zero flag set? */
	jz 7f
	movl %eax,%ecx		/* initialize x86 loop counter */
	push %rax
	xorl %eax,%eax
8:	rep
	stosb 			/* zero the rest */
11:	pop %rax
7:	ret
	CFI_ENDPROC
END(copy_user_generic_c)


	.section __ex_table,"a"
	.section __ex_table,"a"
	.quad 1b,3b
	.align 8
	.quad 2b,5b
	.quad 1b,11b
	.quad 8b,11b
	.quad 3b,12b
	.quad 10b,3b
	.previous
	.previous
	CFI_ENDPROC
ENDPROC(copy_user_generic_string)
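
Both before and after this change, the .quad pairs emitted into the __ex_table sections above are what make the numbered and labelled loads and stores safe: each pair records the address of an instruction that may fault on a bad user pointer together with the fixup address the page-fault handler should resume at, which in the new code is always a short stub that recomputes the remaining length and jumps to copy_user_handle_tail(). The following is a self-contained sketch of that lookup; the two-word entry layout is roughly what the 64-bit kernel used at the time, and the linear scan is a simplification (the kernel binary-searches a sorted table).

#include <stddef.h>

struct exception_table_entry {
	unsigned long insn;	/* address of the load/store that may fault */
	unsigned long fixup;	/* address to resume at if it does */
};

/* Return the fixup address for a faulting instruction, or 0 if the fault
 * did not happen at a whitelisted user access. */
static unsigned long find_fixup(const struct exception_table_entry *tbl,
				size_t n, unsigned long fault_ip)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tbl[i].insn == fault_ip)
			return tbl[i].fixup;
	return 0;
}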
arch/x86/lib/copy_user_nocache_64.S: +103 −180
/* Copyright 2002 Andi Kleen, SuSE Labs.
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 * Subject to the GNU Public License v2.
 *
 *
 * Functions to copy from and to user space.
 * Functions to copy from and to user space.
@@ -12,204 +14,125 @@
#include <asm/current.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>

/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination/source out of cache for more performance.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * rcx zero flag	when 1 zero on exception
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(__copy_user_nocache)
	CFI_STARTPROC
	pushq %rbx
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rbx, 0
	pushq %rcx		/* save zero flag */
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rcx, 0

	xorl %eax,%eax		/* zero for the exception handler */


	.macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	/* check for bad alignment of destination */
	movl %edi,%ecx
	movl %edi,%ecx
	andl $7,%ecx
	andl $7,%ecx
	jnz  .Lbad_alignment
	jz 102f				/* already aligned */
.Lafter_bad_alignment:
	subl $8,%ecx
#endif
	negl %ecx

	subl %ecx,%edx
	movq %rdx,%rcx
100:	movb (%rsi),%al

101:	movb %al,(%rdi)
	movl $64,%ebx
	incq %rsi
	shrq $6,%rdx
	incq %rdi
	decq %rdx
	decl %ecx
	js   .Lhandle_tail
	jnz 100b

102:
	.p2align 4
	.section .fixup,"ax"
.Lloop:
103:	addl %r8d,%edx			/* ecx is zerorest also */
.Ls1:	movq (%rsi),%r11
	jmp copy_user_handle_tail
.Ls2:	movq 1*8(%rsi),%r8
	.previous
.Ls3:	movq 2*8(%rsi),%r9
.Ls4:	movq 3*8(%rsi),%r10
.Ld1:	movnti %r11,(%rdi)
.Ld2:	movnti %r8,1*8(%rdi)
.Ld3:	movnti %r9,2*8(%rdi)
.Ld4:	movnti %r10,3*8(%rdi)

.Ls5:	movq 4*8(%rsi),%r11
.Ls6:	movq 5*8(%rsi),%r8
.Ls7:	movq 6*8(%rsi),%r9
.Ls8:	movq 7*8(%rsi),%r10
.Ld5:	movnti %r11,4*8(%rdi)
.Ld6:	movnti %r8,5*8(%rdi)
.Ld7:	movnti %r9,6*8(%rdi)
.Ld8:	movnti %r10,7*8(%rdi)


	dec  %rdx
	.section __ex_table,"a"
	.align 8
	.quad 100b,103b
	.quad 101b,103b
	.previous
#endif
	.endm


/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * This will force destination/source out of cache for more performance.
 */
ENTRY(__copy_user_nocache)
	CFI_STARTPROC
	cmpl $8,%edx
	jb 20f		/* less then 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz 17f
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	leaq 64(%rdi),%rdi

	jns  .Lloop

	.p2align 4
.Lhandle_tail:
	movl %ecx,%edx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	movl $8,%ebx
	.p2align 4
.Lloop_8:
.Ls9:	movq (%rsi),%r8
.Ld9:	movnti %r8,(%rdi)
	decl %ecx
	decl %ecx
	leaq 8(%rdi),%rdi
	jnz 1b
17:	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rsi),%rsi
	jnz .Lloop_8
	leaq 8(%rdi),%rdi

	decl %ecx
.Lhandle_7:
	jnz 18b
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
	movl %edx,%ecx
	andl $7,%ecx
21:	movb (%rsi),%al
	jz   .Lende
22:	movb %al,(%rdi)
	.p2align 4
.Lloop_1:
.Ls10:	movb (%rsi),%bl
.Ld10:	movb %bl,(%rdi)
	incq %rdi
	incq %rsi
	incq %rsi
	incq %rdi
	decl %ecx
	decl %ecx
	jnz .Lloop_1
	jnz 21b

23:	xorl %eax,%eax
	CFI_REMEMBER_STATE
.Lende:
	popq %rcx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE %rcx
	popq %rbx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE rbx
	sfence
	sfence
	ret
	ret
	CFI_RESTORE_STATE


#ifdef FIX_ALIGNMENT
	.section .fixup,"ax"
	/* align destination */
30:	shll $6,%ecx
	.p2align 4
	addl %ecx,%edx
.Lbad_alignment:
	jmp 60f
	movl $8,%r9d
40:	leal (%edx,%ecx,8),%edx
	subl %ecx,%r9d
	jmp 60f
	movl %r9d,%ecx
50:	movl %ecx,%edx
	cmpq %r9,%rdx
60:	sfence
	jz   .Lhandle_7
	movl %r8d,%ecx
	js   .Lhandle_7
	jmp copy_user_handle_tail
.Lalign_1:
	.previous
.Ls11:	movb (%rsi),%bl
.Ld11:	movb %bl,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .Lalign_1
	subq %r9,%rdx
	jmp .Lafter_bad_alignment
#endif


	/* table sorted by exception address */
	.section __ex_table,"a"
	.section __ex_table,"a"
	.align 8
	.quad 1b,30b
	.quad .Ls1,.Ls1e	/* .Ls[1-4] - 0 bytes copied */
	.quad 2b,30b
	.quad .Ls2,.Ls1e
	.quad 3b,30b
	.quad .Ls3,.Ls1e
	.quad 4b,30b
	.quad .Ls4,.Ls1e
	.quad 5b,30b
	.quad .Ld1,.Ls1e	/* .Ld[1-4] - 0..24 bytes coped */
	.quad 6b,30b
	.quad .Ld2,.Ls2e
	.quad 7b,30b
	.quad .Ld3,.Ls3e
	.quad 8b,30b
	.quad .Ld4,.Ls4e
	.quad 9b,30b
	.quad .Ls5,.Ls5e	/* .Ls[5-8] - 32 bytes copied */
	.quad 10b,30b
	.quad .Ls6,.Ls5e
	.quad 11b,30b
	.quad .Ls7,.Ls5e
	.quad 12b,30b
	.quad .Ls8,.Ls5e
	.quad 13b,30b
	.quad .Ld5,.Ls5e	/* .Ld[5-8] - 32..56 bytes copied */
	.quad 14b,30b
	.quad .Ld6,.Ls6e
	.quad 15b,30b
	.quad .Ld7,.Ls7e
	.quad 16b,30b
	.quad .Ld8,.Ls8e
	.quad 18b,40b
	.quad .Ls9,.Le_quad
	.quad 19b,40b
	.quad .Ld9,.Le_quad
	.quad 21b,50b
	.quad .Ls10,.Le_byte
	.quad 22b,50b
	.quad .Ld10,.Le_byte
#ifdef FIX_ALIGNMENT
	.quad .Ls11,.Lzero_rest
	.quad .Ld11,.Lzero_rest
#endif
	.quad .Le5,.Le_zero
	.previous
	.previous

	/* eax: zero, ebx: 64 */
.Ls1e: 	addl $8,%eax	/* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
.Ls2e: 	addl $8,%eax
.Ls3e: 	addl $8,%eax
.Ls4e: 	addl $8,%eax
.Ls5e: 	addl $8,%eax
.Ls6e: 	addl $8,%eax
.Ls7e: 	addl $8,%eax
.Ls8e: 	addl $8,%eax
	addq %rbx,%rdi	/* +64 */
	subq %rax,%rdi  /* correct destination with computed offset */

	shlq $6,%rdx	/* loop counter * 64 (stride length) */
	addq %rax,%rdx	/* add offset to loopcnt */
	andl $63,%ecx	/* remaining bytes */
	addq %rcx,%rdx	/* add them */
	jmp .Lzero_rest

	/* exception on quad word loop in tail handling */
	/* ecx:	loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
	shll $3,%ecx
	andl $7,%edx
	addl %ecx,%edx
	/* edx: bytes to zero, rdi: dest, eax:zero */
.Lzero_rest:
	cmpl $0,(%rsp)	/* zero flag set? */
	jz   .Le_zero
	movq %rdx,%rcx
.Le_byte:
	xorl %eax,%eax
.Le5:	rep
	stosb
	/* when there is another exception while zeroing the rest just return */
.Le_zero:
	movq %rdx,%rax
	jmp .Lende
	CFI_ENDPROC
	CFI_ENDPROC
ENDPROC(__copy_user_nocache)
ENDPROC(__copy_user_nocache)
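
For completeness, a sketch of how ordinary kernel code consumes the uncopied-byte count that every routine above returns; read_from_user(), dst, usrc and len are made-up names for illustration.

#include <linux/uaccess.h>
#include <linux/errno.h>

/* copy_from_user() returns the number of bytes it could NOT copy, so any
 * non-zero result means the user range was partly or wholly inaccessible. */
static int read_from_user(void *dst, const void __user *usrc, unsigned long len)
{
	if (copy_from_user(dst, usrc, len))
		return -EFAULT;
	return 0;
}

The zeroing selected by the zerorest flag in the fixup paths exists so that a copy_from_user() caller which ignores this return value never sees stale kernel memory in the uncopied part of the destination buffer.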