Fast 32-bit to 16-bit framebuffer blit

From wiki.gp2x.org

Sometimes you have an application you are porting that was designed entirely for 32bpp framebuffers. Ideally, you'd convert it to 16bpp for the GP2X, but sometimes that would be very difficult. If you are looking for a modest speedup on full-screen 32-bit to 16-bit blits, I have taken some code from forum member A_SN and rewritten it for this purpose (unrolling the loop to handle 8 pixels per iteration, using all available registers together with LDM/STM block transfers, and having it assume the proper byte order for most apps). It was written for use in UQM2X by Senor Quack (Dan Silsby).

Here are some benchmarks to prove mine is faster than straight Paeryn SDL: (SDL_BlitSurface between two 320x240 SWSURFACEs, 32bpp source and 16bpp destination)

PAERYN 16MB STOCK SDL 1.2.9:
----------------------------
 
[root@gp2x benchmark]$./benchmark.gpe
Waiting 5 seconds to begin test...
SDL test: 76694697 usec (10,000 calls)
        AVG ms/CALL:  7.669

SENOR QUACK'S FB_WRITE:
-----------------------
 
Beginning Senor Quack test:
(waiting 2 seconds..)
Senor Quack's fb_write: 73051858 usec (10,000 calls)
        AVG ms/CALL:  7.305

That is a 4.75% improvement in speed. Modest, indeed, but it helped make the difference for me. If your program did nothing but blit a 32bpp surface to the GP2X framebuffer over and over, you'd get 130.4 fps with Paeryn 1.2.9 SDL. With my fb_write function you would get 136.9 fps. This helped sections of my game that were stuttery at 200 MHz.

THE CODE:

@ C prototype : extern void fb_write(uint64_t *screen, uint32_t *fb);
@ Credits to A_SN from gp32x.com for the original code.. Heavily modified by me to be faster and in the right order for my app.
@ DKS - I have added this to UQM for a blitting speed-up.
@		(all uqm code uses 32-bit surfaces, but GP2X is 16-bit.  this is faster than SDL's 32-to-16bit generic blitter)
@
@ Syntax : GNU as, ARM (32-bit ARM state) instructions.
@
@ What it does:
@	Converts one complete 320x240 screen of 32-bit XRGB8888 pixels (first parameter)
@	into 16-bit RGB565 pixels written to the pixel array given by the second parameter.
@	Each loop iteration reads 8 32-bit pixels and writes 4 words holding 8 16-bit pixels.
@	The copy runs from the end of both buffers towards the beginning.
@	Within each output word the pixel from the lower source address lands in the low
@	halfword (i.e. little-endian pixel order in memory).
@	The top byte (X) of every source pixel is ignored.
@
@ In:       r0 = source buffer,      320*240 32-bit pixels (must be 4-byte aligned)
@           r1 = destination buffer, 320*240 16-bit pixels (must be 4-byte aligned)
@ Out:      nothing
@ Clobbers: r0-r3, ip, condition flags.  r4-r11 are saved and restored on the stack.
@
@ Alignment note: the end-of-buffer offsets below are the offsets of the LAST WORD of each
@ buffer (size - 4).  The earlier version used size - 1, which handed LDM/STM a misaligned
@ base and only worked because ARM9 ignores address bits [1:0] for block transfers.

	.equ	FB_PIXELS,        320 * 240		@ pixels in one full screen
	.equ	FB_SRC_LAST_WORD, FB_PIXELS * 4 - 4	@ byte offset of last 32-bit source pixel
	.equ	FB_DST_LAST_WORD, FB_PIXELS * 2 - 4	@ byte offset of last destination word (2 pixels)
	.equ	FB_LOOPS,         FB_PIXELS / 8		@ 8 pixels are converted per iteration

@ pack565 dst, hi, lo
@	dst = (RGB565(hi) << 16) | RGB565(lo), where hi and lo hold XRGB8888 pixels.
@	Requires r1 = 0xF8 and r2 = 0xFC.
@	Clobbers r3, r4 and the lo register.  dst must be a different register from hi and lo.
.macro pack565 dst, hi, lo
	and	r3, \hi, r1, lsl #16		@ hi red   : keep bits 23-19
	and	r4, \hi, r2, lsl #8		@ hi green : keep bits 15-10
	mov	\dst, r3, lsl #8		@ red   -> bits 31-27
	orr	\dst, \dst, r4, lsl #11		@ green -> bits 26-21
	and	r3, \hi, r1			@ hi blue  : keep bits 7-3
	orr	\dst, \dst, r3, lsl #13		@ blue  -> bits 20-16
	and	r3, \lo, r1, lsl #16		@ lo red   : keep bits 23-19
	orr	\dst, \dst, r3, lsr #8		@ red   -> bits 15-11
	and	r3, \lo, r2, lsl #8		@ lo green : keep bits 15-10
	orr	\dst, \dst, r3, lsr #5		@ green -> bits 10-5
	and	\lo, \lo, r1			@ lo blue  : keep bits 7-3
	orr	\dst, \dst, \lo, lsr #3		@ blue  -> bits 4-0
.endm

	.text
	.align	2
	.global	fb_write
	.type	fb_write, %function
fb_write:
	stmfd	sp!, {r4-r11, lr}		@ save callee-saved registers and return address

	ldr	lr, .Lfb_consts			@ lr = FB_SRC_LAST_WORD
	ldr	ip, .Lfb_consts+4		@ ip = FB_LOOPS (iterations remaining)

	add	lr, r0, lr			@ lr -> last source pixel
	ldr	r0, .Lfb_consts+8		@ r0 = FB_DST_LAST_WORD
	add	r0, r0, r1			@ r0 -> last destination word

	mov	r1, #0xF8			@ mask for the top 5 bits of red / blue
	mov	r2, #0xFC			@ mask for the top 6 bits of green
.Lfb_loop:
	ldmda	lr!, {r5-r8}			@ load 4 source pixels; r8 = highest address
	pack565	r11, r8, r7			@ r11 = pixels from r8 (high half) and r7 (low half)
	pack565	r10, r6, r5			@ r10 = pixels from r6 (high half) and r5 (low half)

	ldmda	lr!, {r5-r8}			@ load the 4 pixels below those
	pack565	r9, r8, r7			@ r9  = pixels from r8 (high half) and r7 (low half)
	pack565	r8, r6, r5			@ r8  = pixels from r6 (high half) and r5 (low half)

	stmda	r0!, {r8-r11}			@ store 8 converted pixels; r11 goes to the highest address

	subs	ip, ip, #1			@ one iteration done
	bne	.Lfb_loop			@ loop until all FB_LOOPS iterations have run

	ldmfd	sp!, {r4-r11, pc}		@ restore registers and return

	.align	2
.Lfb_consts:
	.word	FB_SRC_LAST_WORD
	.word	FB_LOOPS
	.word	FB_DST_LAST_WORD

	.size	fb_write, .-fb_write
.end

Personal tools