x86: fix copy_user on x86 (ad2fc2cd) · Commits · e / devices / android_kernel_xiaomi_markw

arch/x86/lib/copy_user_64.S

+172 −255

Original line number	Diff line number	Diff line
		/* Copyright 2002 Andi Kleen, SuSE Labs.
		/*
		* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
		* Copyright 2002 Andi Kleen, SuSE Labs.
		* Subject to the GNU Public License v2.
		*
		* Functions to copy from and to user space.
		@@ -33,6 +35,35 @@
		.previous
		.endm

		/*
		* ALIGN_DESTINATION - copy single bytes until the destination is
		* 8-byte aligned. Expands to nothing unless FIX_ALIGNMENT is defined.
		*
		* Input:
		* rdi destination
		* rsi source
		* edx count (every expansion site in this file is guarded by a
		*     "cmpl $8,%edx; jb" check, so count >= 8 here)
		*
		* Output:
		* rdi, rsi advanced past the 0..7 bytes copied by the macro
		* edx count reduced by that same number of bytes
		* Clobbers ecx, al and the flags.
		*
		* A fault on either byte access lands in the fixup code below, which
		* rebuilds the number of uncopied bytes in edx and jumps to
		* copy_user_handle_tail.
		*/
		.macro ALIGN_DESTINATION
		#ifdef FIX_ALIGNMENT
		/* check for bad alignment of destination */
		movl %edi,%ecx
		andl $7,%ecx
		jz 102f /* already aligned */
		/* ecx = 8 - (edi & 7): bytes needed to reach the next 8-byte boundary */
		subl $8,%ecx
		negl %ecx
		/* charge the whole alignment run to the count up front */
		subl %ecx,%edx
		100: movb (%rsi),%al
		101: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz 100b
		102:
		.section .fixup,"ax"
		/*
		* edx already excludes the full alignment run, while ecx still holds
		* the part of that run that was not copied before the fault, so the
		* uncopied total is edx + ecx. (r8d is never written by this macro.)
		*/
		103: addl %ecx,%edx /* ecx is zerorest also */
		jmp copy_user_handle_tail
		.previous

		.section __ex_table,"a"
		.align 8
		.quad 100b,103b
		.quad 101b,103b
		.previous
		#endif
		.endm

		/* Standard copy_to_user with segment limit checking */
		ENTRY(copy_to_user)
		CFI_STARTPROC
		@@ -42,19 +73,6 @@ ENTRY(copy_to_user)
		jc bad_to_user
		cmpq TI_addr_limit(%rax),%rcx
		jae bad_to_user
		xorl %eax,%eax /* clear zero flag */
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC

		ENTRY(copy_user_generic)
		CFI_STARTPROC
		movl $1,%ecx /* set zero flag */
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC

		ENTRY(__copy_from_user_inatomic)
		CFI_STARTPROC
		xorl %ecx,%ecx /* clear zero flag */
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC

		@@ -67,13 +85,25 @@ ENTRY(copy_from_user)
		jc bad_from_user
		cmpq TI_addr_limit(%rax),%rcx
		jae bad_from_user
		movl $1,%ecx /* set zero flag */
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC
		ENDPROC(copy_from_user)

		/*
		* copy_user_generic - entry point that performs no address-limit
		* check of its own; it only dispatches through ALTERNATIVE_JUMP to
		* copy_user_generic_unrolled or copy_user_generic_string, keyed on
		* X86_FEATURE_REP_GOOD. (ALTERNATIVE_JUMP is defined outside this
		* view; presumably the string variant is used when the feature bit
		* is set - confirm against the macro.)
		*/
		ENTRY(copy_user_generic)
		CFI_STARTPROC
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC
		ENDPROC(copy_user_generic)

		/*
		* __copy_from_user_inatomic - same body as copy_user_generic: no
		* address-limit check here, just an ALTERNATIVE_JUMP dispatch to
		* copy_user_generic_unrolled or copy_user_generic_string keyed on
		* X86_FEATURE_REP_GOOD.
		*/
		ENTRY(__copy_from_user_inatomic)
		CFI_STARTPROC
		ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
		CFI_ENDPROC
		ENDPROC(__copy_from_user_inatomic)

		.section .fixup,"ax"
		/* must zero dest */
		ENTRY(bad_from_user)
		bad_from_user:
		CFI_STARTPROC
		movl %edx,%ecx
		@@ -84,219 +114,111 @@ bad_to_user:
		movl %edx,%eax
		ret
		CFI_ENDPROC
		END(bad_from_user)
		ENDPROC(bad_from_user)
		.previous


		/*
		* copy_user_generic_unrolled - memory copy with exception handling.
		* This version is for CPUs like P4 that don't have efficient micro code for rep movsq
		* This version is for CPUs like P4 that don't have efficient micro
		* code for rep movsq
		*
		* Input:
		* rdi destination
		* rsi source
		* rdx count
		* ecx zero flag -- if true zero destination on error
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		* eax uncopied bytes or 0 if successful.
		*/
		ENTRY(copy_user_generic_unrolled)
		CFI_STARTPROC
		pushq %rbx
		CFI_ADJUST_CFA_OFFSET 8
		CFI_REL_OFFSET rbx, 0
		pushq %rcx
		CFI_ADJUST_CFA_OFFSET 8
		CFI_REL_OFFSET rcx, 0
		xorl %eax,%eax /* zero for the exception handler */

		#ifdef FIX_ALIGNMENT
		/* check for bad alignment of destination */
		movl %edi,%ecx
		andl $7,%ecx
		jnz .Lbad_alignment
		.Lafter_bad_alignment:
		#endif

		movq %rdx,%rcx

		movl $64,%ebx
		shrq $6,%rdx
		decq %rdx
		js .Lhandle_tail

		.p2align 4
		.Lloop:
		.Ls1: movq (%rsi),%r11
		.Ls2: movq 1*8(%rsi),%r8
		.Ls3: movq 2*8(%rsi),%r9
		.Ls4: movq 3*8(%rsi),%r10
		.Ld1: movq %r11,(%rdi)
		.Ld2: movq %r8,1*8(%rdi)
		.Ld3: movq %r9,2*8(%rdi)
		.Ld4: movq %r10,3*8(%rdi)

		.Ls5: movq 4*8(%rsi),%r11
		.Ls6: movq 5*8(%rsi),%r8
		.Ls7: movq 6*8(%rsi),%r9
		.Ls8: movq 7*8(%rsi),%r10
		.Ld5: movq %r11,4*8(%rdi)
		.Ld6: movq %r8,5*8(%rdi)
		.Ld7: movq %r9,6*8(%rdi)
		.Ld8: movq %r10,7*8(%rdi)

		decq %rdx

		cmpl $8,%edx
		jb 20f /* less than 8 bytes, go to byte copy loop */
		ALIGN_DESTINATION
		movl %edx,%ecx
		andl $63,%edx
		shrl $6,%ecx
		jz 17f
		1: movq (%rsi),%r8
		2: movq 1*8(%rsi),%r9
		3: movq 2*8(%rsi),%r10
		4: movq 3*8(%rsi),%r11
		5: movq %r8,(%rdi)
		6: movq %r9,1*8(%rdi)
		7: movq %r10,2*8(%rdi)
		8: movq %r11,3*8(%rdi)
		9: movq 4*8(%rsi),%r8
		10: movq 5*8(%rsi),%r9
		11: movq 6*8(%rsi),%r10
		12: movq 7*8(%rsi),%r11
		13: movq %r8,4*8(%rdi)
		14: movq %r9,5*8(%rdi)
		15: movq %r10,6*8(%rdi)
		16: movq %r11,7*8(%rdi)
		leaq 64(%rsi),%rsi
		leaq 64(%rdi),%rdi

		jns .Lloop

		.p2align 4
		.Lhandle_tail:
		movl %ecx,%edx
		andl $63,%ecx
		shrl $3,%ecx
		jz .Lhandle_7
		movl $8,%ebx
		.p2align 4
		.Lloop_8:
		.Ls9: movq (%rsi),%r8
		.Ld9: movq %r8,(%rdi)
		decl %ecx
		leaq 8(%rdi),%rdi
		jnz 1b
		17: movl %edx,%ecx
		andl $7,%edx
		shrl $3,%ecx
		jz 20f
		18: movq (%rsi),%r8
		19: movq %r8,(%rdi)
		leaq 8(%rsi),%rsi
		jnz .Lloop_8

		.Lhandle_7:
		leaq 8(%rdi),%rdi
		decl %ecx
		jnz 18b
		20: andl %edx,%edx
		jz 23f
		movl %edx,%ecx
		andl $7,%ecx
		jz .Lende
		.p2align 4
		.Lloop_1:
		.Ls10: movb (%rsi),%bl
		.Ld10: movb %bl,(%rdi)
		incq %rdi
		21: movb (%rsi),%al
		22: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz .Lloop_1

		CFI_REMEMBER_STATE
		.Lende:
		popq %rcx
		CFI_ADJUST_CFA_OFFSET -8
		CFI_RESTORE rcx
		popq %rbx
		CFI_ADJUST_CFA_OFFSET -8
		CFI_RESTORE rbx
		jnz 21b
		23: xor %eax,%eax
		ret
		CFI_RESTORE_STATE

		#ifdef FIX_ALIGNMENT
		/* align destination */
		.p2align 4
		.Lbad_alignment:
		movl $8,%r9d
		subl %ecx,%r9d
		movl %r9d,%ecx
		cmpq %r9,%rdx
		jz .Lhandle_7
		js .Lhandle_7
		.Lalign_1:
		.Ls11: movb (%rsi),%bl
		.Ld11: movb %bl,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz .Lalign_1
		subq %r9,%rdx
		jmp .Lafter_bad_alignment
		#endif
		.section .fixup,"ax"
		30: shll $6,%ecx
		addl %ecx,%edx
		jmp 60f
		40: leal (%edx,%ecx,8),%edx
		jmp 60f
		50: movl %ecx,%edx
		60: jmp copy_user_handle_tail /* ecx is zerorest also */
		.previous

		/* table sorted by exception address */
		.section __ex_table,"a"
		.align 8
		.quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */
		.quad .Ls2,.Ls1e
		.quad .Ls3,.Ls1e
		.quad .Ls4,.Ls1e
		.quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */
		.quad .Ld2,.Ls2e
		.quad .Ld3,.Ls3e
		.quad .Ld4,.Ls4e
		.quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */
		.quad .Ls6,.Ls5e
		.quad .Ls7,.Ls5e
		.quad .Ls8,.Ls5e
		.quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */
		.quad .Ld6,.Ls6e
		.quad .Ld7,.Ls7e
		.quad .Ld8,.Ls8e
		.quad .Ls9,.Le_quad
		.quad .Ld9,.Le_quad
		.quad .Ls10,.Le_byte
		.quad .Ld10,.Le_byte
		#ifdef FIX_ALIGNMENT
		.quad .Ls11,.Lzero_rest
		.quad .Ld11,.Lzero_rest
		#endif
		.quad .Le5,.Le_zero
		.quad 1b,30b
		.quad 2b,30b
		.quad 3b,30b
		.quad 4b,30b
		.quad 5b,30b
		.quad 6b,30b
		.quad 7b,30b
		.quad 8b,30b
		.quad 9b,30b
		.quad 10b,30b
		.quad 11b,30b
		.quad 12b,30b
		.quad 13b,30b
		.quad 14b,30b
		.quad 15b,30b
		.quad 16b,30b
		.quad 18b,40b
		.quad 19b,40b
		.quad 21b,50b
		.quad 22b,50b
		.previous

		/* eax: zero, ebx: 64 */
		.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
		.Ls2e: addl $8,%eax
		.Ls3e: addl $8,%eax
		.Ls4e: addl $8,%eax
		.Ls5e: addl $8,%eax
		.Ls6e: addl $8,%eax
		.Ls7e: addl $8,%eax
		.Ls8e: addl $8,%eax
		addq %rbx,%rdi /* +64 */
		subq %rax,%rdi /* correct destination with computed offset */

		shlq $6,%rdx /* loop counter * 64 (stride length) */
		addq %rax,%rdx /* add offset to loopcnt */
		andl $63,%ecx /* remaining bytes */
		addq %rcx,%rdx /* add them */
		jmp .Lzero_rest

		/* exception on quad word loop in tail handling */
		/* ecx: loopcnt/8, %edx: length, rdi: correct */
		.Le_quad:
		shll $3,%ecx
		andl $7,%edx
		addl %ecx,%edx
		/* edx: bytes to zero, rdi: dest, eax:zero */
		.Lzero_rest:
		cmpl $0,(%rsp)
		jz .Le_zero
		movq %rdx,%rcx
		.Le_byte:
		xorl %eax,%eax
		.Le5: rep
		stosb
		/* when there is another exception while zeroing the rest just return */
		.Le_zero:
		movq %rdx,%rax
		jmp .Lende
		CFI_ENDPROC
		ENDPROC(copy_user_generic)

		ENDPROC(copy_user_generic_unrolled)

		/* Some CPUs run faster using the string copy instructions.
		This is also a lot simpler. Use them when possible.
		Patch in jmps to this code instead of copying it fully
		to avoid unwanted aliasing in the exception tables. */

		/* rdi destination
		* rsi source
		* rdx count
		* ecx zero flag
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		* This is also a lot simpler. Use them when possible.
		*
		* Only 4GB of copy is supported. This shouldn't be a problem
		* because the kernel normally only writes from/to page sized chunks
		@@ -304,48 +226,43 @@ ENDPROC(copy_user_generic)
		* And more would be dangerous because both Intel and AMD have
		* errata with rep movsq > 4GB. If someone feels the need to fix
		* this please consider this.
		*
		* Input:
		* rdi destination
		* rsi source
		* rdx count
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		*/
		ENTRY(copy_user_generic_string)
		CFI_STARTPROC
		movl %ecx,%r8d /* save zero flag */
		andl %edx,%edx
		jz 4f
		cmpl $8,%edx
		jb 2f /* less than 8 bytes, go to byte copy loop */
		ALIGN_DESTINATION
		movl %edx,%ecx
		shrl $3,%ecx
		andl $7,%edx
		jz 10f
		1: rep
		movsq
		movl %edx,%ecx
		2: rep
		2: movl %edx,%ecx
		3: rep
		movsb
		9: movl %ecx,%eax
		ret

		/* multiple of 8 byte */
		10: rep
		movsq
		xor %eax,%eax
		4: xorl %eax,%eax
		ret

		/* exception handling */
		3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
		jmp 6f
		5: movl %ecx,%eax /* exception on byte loop */
		/* eax: left over bytes */
		6: testl %r8d,%r8d /* zero flag set? */
		jz 7f
		movl %eax,%ecx /* initialize x86 loop counter */
		push %rax
		xorl %eax,%eax
		8: rep
		stosb /* zero the rest */
		11: pop %rax
		7: ret
		CFI_ENDPROC
		END(copy_user_generic_c)
		.section .fixup,"ax"
		11: leal (%edx,%ecx,8),%ecx
		12: movl %ecx,%edx /* ecx is zerorest also */
		jmp copy_user_handle_tail
		.previous

		.section __ex_table,"a"
		.quad 1b,3b
		.quad 2b,5b
		.quad 8b,11b
		.quad 10b,3b
		.align 8
		.quad 1b,11b
		.quad 3b,12b
		.previous
		CFI_ENDPROC
		ENDPROC(copy_user_generic_string)

arch/x86/lib/copy_user_nocache_64.S

+103 −180

Original line number	Diff line number	Diff line
		/* Copyright 2002 Andi Kleen, SuSE Labs.
		/*
		* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
		* Copyright 2002 Andi Kleen, SuSE Labs.
		* Subject to the GNU Public License v2.
		*
		* Functions to copy from and to user space.
		@@ -12,204 +14,125 @@
		#include <asm/current.h>
		#include <asm/asm-offsets.h>
		#include <asm/thread_info.h>
		#include <asm/cpufeature.h>

		/*
		* copy_user_nocache - Uncached memory copy with exception handling
		* This will force destination/source out of cache for more performance.
		*
		* Input:
		* rdi destination
		* rsi source
		* rdx count
		* rcx zero flag when 1 zero on exception
		*
		* Output:
		* eax uncopied bytes or 0 if successful.
		*/
		ENTRY(__copy_user_nocache)
		CFI_STARTPROC
		pushq %rbx
		CFI_ADJUST_CFA_OFFSET 8
		CFI_REL_OFFSET rbx, 0
		pushq %rcx /* save zero flag */
		CFI_ADJUST_CFA_OFFSET 8
		CFI_REL_OFFSET rcx, 0

		xorl %eax,%eax /* zero for the exception handler */

		.macro ALIGN_DESTINATION
		#ifdef FIX_ALIGNMENT
		/* check for bad alignment of destination */
		movl %edi,%ecx
		andl $7,%ecx
		jnz .Lbad_alignment
		.Lafter_bad_alignment:
		#endif

		movq %rdx,%rcx

		movl $64,%ebx
		shrq $6,%rdx
		decq %rdx
		js .Lhandle_tail

		.p2align 4
		.Lloop:
		.Ls1: movq (%rsi),%r11
		.Ls2: movq 1*8(%rsi),%r8
		.Ls3: movq 2*8(%rsi),%r9
		.Ls4: movq 3*8(%rsi),%r10
		.Ld1: movnti %r11,(%rdi)
		.Ld2: movnti %r8,1*8(%rdi)
		.Ld3: movnti %r9,2*8(%rdi)
		.Ld4: movnti %r10,3*8(%rdi)

		.Ls5: movq 4*8(%rsi),%r11
		.Ls6: movq 5*8(%rsi),%r8
		.Ls7: movq 6*8(%rsi),%r9
		.Ls8: movq 7*8(%rsi),%r10
		.Ld5: movnti %r11,4*8(%rdi)
		.Ld6: movnti %r8,5*8(%rdi)
		.Ld7: movnti %r9,6*8(%rdi)
		.Ld8: movnti %r10,7*8(%rdi)
		jz 102f /* already aligned */
		subl $8,%ecx
		negl %ecx
		subl %ecx,%edx
		100: movb (%rsi),%al
		101: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz 100b
		102:
		.section .fixup,"ax"
		/*
		* Fault in the byte-alignment loop (labels 100/101): edx already
		* excludes the whole alignment run and ecx holds the bytes of that
		* run still uncopied, so the uncopied total is edx + ecx. r8d is not
		* set by the alignment code and must not be used here.
		*/
		103: addl %ecx,%edx /* ecx is zerorest also */
		jmp copy_user_handle_tail
		.previous

		dec %rdx
		.section __ex_table,"a"
		.align 8
		.quad 100b,103b
		.quad 101b,103b
		.previous
		#endif
		.endm

		/*
		* copy_user_nocache - Uncached memory copy with exception handling
		* This will force destination/source out of cache for more performance.
		*/
		ENTRY(__copy_user_nocache)
		CFI_STARTPROC
		cmpl $8,%edx
		jb 20f /* less than 8 bytes, go to byte copy loop */
		ALIGN_DESTINATION
		movl %edx,%ecx
		andl $63,%edx
		shrl $6,%ecx
		jz 17f
		1: movq (%rsi),%r8
		2: movq 1*8(%rsi),%r9
		3: movq 2*8(%rsi),%r10
		4: movq 3*8(%rsi),%r11
		5: movnti %r8,(%rdi)
		6: movnti %r9,1*8(%rdi)
		7: movnti %r10,2*8(%rdi)
		8: movnti %r11,3*8(%rdi)
		9: movq 4*8(%rsi),%r8
		10: movq 5*8(%rsi),%r9
		11: movq 6*8(%rsi),%r10
		12: movq 7*8(%rsi),%r11
		13: movnti %r8,4*8(%rdi)
		14: movnti %r9,5*8(%rdi)
		15: movnti %r10,6*8(%rdi)
		16: movnti %r11,7*8(%rdi)
		leaq 64(%rsi),%rsi
		leaq 64(%rdi),%rdi

		jns .Lloop

		.p2align 4
		.Lhandle_tail:
		movl %ecx,%edx
		andl $63,%ecx
		shrl $3,%ecx
		jz .Lhandle_7
		movl $8,%ebx
		.p2align 4
		.Lloop_8:
		.Ls9: movq (%rsi),%r8
		.Ld9: movnti %r8,(%rdi)
		decl %ecx
		leaq 8(%rdi),%rdi
		jnz 1b
		17: movl %edx,%ecx
		andl $7,%edx
		shrl $3,%ecx
		jz 20f
		18: movq (%rsi),%r8
		19: movnti %r8,(%rdi)
		leaq 8(%rsi),%rsi
		jnz .Lloop_8

		.Lhandle_7:
		leaq 8(%rdi),%rdi
		decl %ecx
		jnz 18b
		20: andl %edx,%edx
		jz 23f
		movl %edx,%ecx
		andl $7,%ecx
		jz .Lende
		.p2align 4
		.Lloop_1:
		.Ls10: movb (%rsi),%bl
		.Ld10: movb %bl,(%rdi)
		incq %rdi
		21: movb (%rsi),%al
		22: movb %al,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz .Lloop_1

		CFI_REMEMBER_STATE
		.Lende:
		popq %rcx
		CFI_ADJUST_CFA_OFFSET -8
		CFI_RESTORE %rcx
		popq %rbx
		CFI_ADJUST_CFA_OFFSET -8
		CFI_RESTORE rbx
		jnz 21b
		23: xorl %eax,%eax
		sfence
		ret
		CFI_RESTORE_STATE

		#ifdef FIX_ALIGNMENT
		/* align destination */
		.p2align 4
		.Lbad_alignment:
		movl $8,%r9d
		subl %ecx,%r9d
		movl %r9d,%ecx
		cmpq %r9,%rdx
		jz .Lhandle_7
		js .Lhandle_7
		.Lalign_1:
		.Ls11: movb (%rsi),%bl
		.Ld11: movb %bl,(%rdi)
		incq %rsi
		incq %rdi
		decl %ecx
		jnz .Lalign_1
		subq %r9,%rdx
		jmp .Lafter_bad_alignment
		#endif
		.section .fixup,"ax"
		30: shll $6,%ecx
		addl %ecx,%edx
		jmp 60f
		40: leal (%edx,%ecx,8),%edx
		jmp 60f
		50: movl %ecx,%edx
		60: sfence
		movl %r8d,%ecx
		jmp copy_user_handle_tail
		.previous

		/* table sorted by exception address */
		.section __ex_table,"a"
		.align 8
		.quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */
		.quad .Ls2,.Ls1e
		.quad .Ls3,.Ls1e
		.quad .Ls4,.Ls1e
		.quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes copied */
		.quad .Ld2,.Ls2e
		.quad .Ld3,.Ls3e
		.quad .Ld4,.Ls4e
		.quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */
		.quad .Ls6,.Ls5e
		.quad .Ls7,.Ls5e
		.quad .Ls8,.Ls5e
		.quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */
		.quad .Ld6,.Ls6e
		.quad .Ld7,.Ls7e
		.quad .Ld8,.Ls8e
		.quad .Ls9,.Le_quad
		.quad .Ld9,.Le_quad
		.quad .Ls10,.Le_byte
		.quad .Ld10,.Le_byte
		#ifdef FIX_ALIGNMENT
		.quad .Ls11,.Lzero_rest
		.quad .Ld11,.Lzero_rest
		#endif
		.quad .Le5,.Le_zero
		.quad 1b,30b
		.quad 2b,30b
		.quad 3b,30b
		.quad 4b,30b
		.quad 5b,30b
		.quad 6b,30b
		.quad 7b,30b
		.quad 8b,30b
		.quad 9b,30b
		.quad 10b,30b
		.quad 11b,30b
		.quad 12b,30b
		.quad 13b,30b
		.quad 14b,30b
		.quad 15b,30b
		.quad 16b,30b
		.quad 18b,40b
		.quad 19b,40b
		.quad 21b,50b
		.quad 22b,50b
		.previous

		/* eax: zero, ebx: 64 */
		.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
		.Ls2e: addl $8,%eax
		.Ls3e: addl $8,%eax
		.Ls4e: addl $8,%eax
		.Ls5e: addl $8,%eax
		.Ls6e: addl $8,%eax
		.Ls7e: addl $8,%eax
		.Ls8e: addl $8,%eax
		addq %rbx,%rdi /* +64 */
		subq %rax,%rdi /* correct destination with computed offset */

		shlq $6,%rdx /* loop counter * 64 (stride length) */
		addq %rax,%rdx /* add offset to loopcnt */
		andl $63,%ecx /* remaining bytes */
		addq %rcx,%rdx /* add them */
		jmp .Lzero_rest

		/* exception on quad word loop in tail handling */
		/* ecx: loopcnt/8, %edx: length, rdi: correct */
		.Le_quad:
		shll $3,%ecx
		andl $7,%edx
		addl %ecx,%edx
		/* edx: bytes to zero, rdi: dest, eax:zero */
		.Lzero_rest:
		cmpl $0,(%rsp) /* zero flag set? */
		jz .Le_zero
		movq %rdx,%rcx
		.Le_byte:
		xorl %eax,%eax
		.Le5: rep
		stosb
		/* when there is another exception while zeroing the rest just return */
		.Le_zero:
		movq %rdx,%rax
		jmp .Lende
		CFI_ENDPROC
		ENDPROC(__copy_user_nocache)