Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0bd40535 authored by Richard Woodruff, committed by Kevin Hilman
Browse files

OMAP3: PM: Update clean_l2 to use v7_flush_dcache_all



Analysis in the TI kernel with ETM showed that using the cache-mapped flush
in the kernel instead of the SO-mapped flush drops the cost by 65% (3.39 ms
down to 1.17 ms) for clean_l2, which is used during sleep sequences.
Overall:
	- speed up
	- unfortunately there isn't a good alternative flush method today
	- code reduction, less maintenance, and fewer potential bugs in
	  unmaintained code

This also fixes the bug with the clean_l2 function usage.

Reported-by: Tony Lindgren <tony@atomide.com>

Cc: Kevin Hilman <khilman@deeprootsystems.com>
Cc: Tony Lindgren <tony@atomide.com>

Acked-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Acked-by: Jean Pihet <j-pihet@ti.com>

[nm@ti.com: ported rkw's proposal to 2.6.37-rc2]
Signed-off-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Richard Woodruff <r-woodruff2@ti.com>
Signed-off-by: Kevin Hilman <khilman@deeprootsystems.com>
parent 1cbbe37a
Loading
Loading
Loading
Loading
+14 −66
Original line number Diff line number Diff line
@@ -520,72 +520,18 @@ clean_caches:
	cmp	r9, #1 /* Check whether L2 inval is required or not*/
	bne	skip_l2_inval
clean_l2:
	/*
	 * clean_l2: clean the data/unified caches before sleeping.
	 *
	 * NOTE(review): this listing looks like a diff rendered without its
	 * +/- markers: the set/way loop below (down to the cssr reset after
	 * "finished") appears to be the code being removed, and the
	 * kernel_flush call at the end its replacement - confirm against
	 * the real file before relying on both parts being present.
	 *
	 * Register use in the set/way loop:
	 *   r0  - clidr value
	 *   r3  - loc * 2 (same scale as r10)
	 *   r10 - current cache level * 2, as written to cssr
	 *   r1  - scratch: cache type field, then csidr value
	 *   r2  - scratch: 3 * level, then shift for the set index
	 *   r4  - highest way number, r5 - shift for the way number
	 *   r7  - set index, counting down to 0
	 *   r9  - way number, counting down to 0 (the value tested by the
	 *         "cmp r9, #1" before this label is overwritten)
	 *   r11 - operand for the set/way operation
	 */
	/* read clidr */
	mrc     p15, 1, r0, c0, c0, 1
	/* extract loc (bits 26:24) from clidr; sets Z if loc is 0 */
	ands    r3, r0, #0x7000000
	/* shift right by 23 so that r3 = loc * 2; flags are not changed */
	mov     r3, r3, lsr #23
	/* if loc is 0, then no need to clean (Z comes from the ands above) */
	beq     finished
	/* start clean at cache level 0 */
	mov     r10, #0
loop1:
	/* work out 3x current cache level (r10 holds 2x the level) */
	add     r2, r10, r10, lsr #1
	/* extract cache type bits from clidr */
	mov     r1, r0, lsr r2
	/* mask off the bits for the current cache only */
	and     r1, r1, #7
	/* see what cache we have at this level */
	cmp     r1, #2
	/* skip if no cache, or just i-cache */
	blt     skip
	/* select current cache level in cssr */
	mcr     p15, 2, r10, c0, c0, 0
	/* isb to sync the new cssr & csidr */
	isb
	/* read the new csidr */
	mrc     p15, 1, r1, c0, c0, 0
	/* extract the line length field */
	and     r2, r1, #7
	/* add 4 (line length offset); r2 is the shift for the set index */
	add     r2, r2, #4
	/* assoc_mask is a literal defined outside this excerpt */
	ldr     r4, assoc_mask
	/* find the maximum way number (csidr >> 3, masked) */
	ands    r4, r4, r1, lsr #3
	/* find bit position of way size increment */
	clz     r5, r4
	/* numset_mask is a literal defined outside this excerpt */
	ldr     r7, numset_mask
	/* extract the maximum set index (csidr >> 13, masked) */
	ands    r7, r7, r1, lsr #13
loop2:
	/* create working copy of the maximum way number */
	mov     r9, r4
loop3:
	/* factor way and cache number into r11 */
	orr     r11, r10, r9, lsl r5
	/* factor index number into r11 */
	orr     r11, r11, r7, lsl r2
	/*
	 * clean by set/way.
	 * NOTE(review): the previous comment here said "clean & invalidate",
	 * but c7, c10, 2 is the clean-only set/way encoding on ARMv7
	 * (clean & invalidate is c7, c14, 2) - confirm which was intended.
	 */
	mcr     p15, 0, r11, c7, c10, 2
	/* decrement the way; loop while the result is >= 0 */
	subs    r9, r9, #1
	bge     loop3
	/* decrement the index; loop while the result is >= 0 */
	subs    r7, r7, #1
	bge     loop2
skip:
	/* increment cache number (r10 steps by 2 = one cache level) */
	add     r10, r10, #2
	/* continue while loc * 2 is still greater than level * 2 */
	cmp     r3, r10
	bgt     loop1
finished:
	/* switch back to cache level 0 */
	mov     r10, #0
	/* select current cache level in cssr */
	mcr     p15, 2, r10, c0, c0, 0
	isb
	/*
	 * Jump out to the kernel flush routine
	 *  - reusing that code is better
	 *  - it executes in a cached space so is faster than refetch per-block
	 *  - should be faster and will change with kernel
	 *  - 'might' have to copy address, load and jump to it
	 *  - lr is used since we are running in SRAM currently.
	 */
	/* r1 = word stored at kernel_flush (address of the flush routine) */
	ldr r1, kernel_flush
	/*
	 * Set the return address by hand: reading pc here is assumed to give
	 * the address of the instruction after the bx (ARM state, pc reads
	 * as current instruction + 8) - confirm this code is never built as
	 * Thumb.
	 */
	mov lr, pc
	/* call the routine; it is expected to return through lr */
	bx  r1

skip_l2_inval:
	/* Data memory barrier and Data sync barrier */
	mov     r1, #0
@@ -668,5 +614,7 @@ cache_pred_disable_mask:
	.word	0xFFFFE7FB
control_stat:
	/* literal word holding CONTROL_STAT (symbol defined outside this excerpt) */
	.word	CONTROL_STAT
kernel_flush:
	/*
	 * Address of v7_flush_dcache_all. clean_l2 loads this word into r1
	 * and branches to it.
	 */
	.word v7_flush_dcache_all
ENTRY(omap34xx_cpu_suspend_sz)
	/*
	 * Distance from the omap34xx_cpu_suspend label to this point.
	 * NOTE(review): presumably the byte size used when copying the
	 * routine to SRAM - confirm against the caller.
	 */
	.word	. - omap34xx_cpu_suspend