mirror of https://github.com/MLXXXp/Arduboy2.git
Optimised Sprites::drawBitmap() for size
This commit is contained in:
parent
f8f46de06d
commit
57392fcd0a
261
src/Sprites.cpp
261
src/Sprites.cpp
|
@ -124,238 +124,53 @@ void Sprites::drawBitmap(int16_t x, int16_t y,
|
||||||
|
|
||||||
sRow += start_h;
|
sRow += start_h;
|
||||||
ofs = (sRow * WIDTH) + x + xOffset;
|
ofs = (sRow * WIDTH) + x + xOffset;
|
||||||
uint8_t *bofs = (uint8_t *)bitmap + (start_h * w) + xOffset;
|
|
||||||
uint8_t data;
|
|
||||||
|
|
||||||
uint8_t mul_amt = 1 << yOffset;
|
uint8_t mul_amt = 1 << yOffset;
|
||||||
uint16_t mask_data;
|
uint16_t mask_data;
|
||||||
uint16_t bitmap_data;
|
uint16_t bitmap_data;
|
||||||
|
|
||||||
switch (draw_mode) {
|
const uint8_t ofs_step = draw_mode == SPRITE_PLUS_MASK ? 2 : 1;
|
||||||
case SPRITE_UNMASKED:
|
const uint8_t ofs_stride = (w - rendered_width)*ofs_step;
|
||||||
// we only want to mask the 8 bits of our own sprite, so we can
|
const uint8_t initial_bofs = ((start_h * w) + xOffset)*ofs_step;
|
||||||
// calculate the mask before the start of the loop
|
|
||||||
mask_data = ~(0xFF * mul_amt);
|
|
||||||
// really if yOffset = 0 you have a faster case here that could be
|
|
||||||
// optimized
|
|
||||||
for (uint8_t a = 0; a < loop_h; a++) {
|
|
||||||
for (uint8_t iCol = 0; iCol < rendered_width; iCol++) {
|
|
||||||
bitmap_data = pgm_read_byte(bofs) * mul_amt;
|
|
||||||
|
|
||||||
if (sRow >= 0) {
|
uint8_t *bofs = (uint8_t *)bitmap + initial_bofs;
|
||||||
data = Arduboy2Base::sBuffer[ofs];
|
uint8_t *mask_ofs = !mask ? (uint8_t *)bitmap : (uint8_t *)mask;
|
||||||
data &= (uint8_t)(mask_data);
|
mask_ofs += initial_bofs + ofs_step - 1;
|
||||||
data |= (uint8_t)(bitmap_data);
|
|
||||||
Arduboy2Base::sBuffer[ofs] = data;
|
for (uint8_t a = 0; a < loop_h; a++) {
|
||||||
}
|
for (uint8_t iCol = 0; iCol < rendered_width; iCol++) {
|
||||||
if (yOffset != 0 && sRow < 7) {
|
uint8_t data;
|
||||||
data = Arduboy2Base::sBuffer[ofs + WIDTH];
|
|
||||||
data &= (*((unsigned char *) (&mask_data) + 1));
|
bitmap_data = pgm_read_byte(bofs) * mul_amt;
|
||||||
data |= (*((unsigned char *) (&bitmap_data) + 1));
|
mask_data = ~bitmap_data;
|
||||||
Arduboy2Base::sBuffer[ofs + WIDTH] = data;
|
|
||||||
}
|
if (draw_mode == SPRITE_UNMASKED) {
|
||||||
ofs++;
|
mask_data = ~(0xFF * mul_amt);
|
||||||
bofs++;
|
} else if (draw_mode == SPRITE_IS_MASK_ERASE) {
|
||||||
}
|
bitmap_data = 0;
|
||||||
sRow++;
|
} else {
|
||||||
bofs += w - rendered_width;
|
mask_data = ~(pgm_read_byte(mask_ofs) * mul_amt);
|
||||||
ofs += WIDTH - rendered_width;
|
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
|
|
||||||
case SPRITE_IS_MASK:
|
if (sRow >= 0) {
|
||||||
for (uint8_t a = 0; a < loop_h; a++) {
|
data = Arduboy2Base::sBuffer[ofs];
|
||||||
for (uint8_t iCol = 0; iCol < rendered_width; iCol++) {
|
data &= (uint8_t)(mask_data);
|
||||||
bitmap_data = pgm_read_byte(bofs) * mul_amt;
|
data |= (uint8_t)(bitmap_data);
|
||||||
if (sRow >= 0) {
|
Arduboy2Base::sBuffer[ofs] = data;
|
||||||
Arduboy2Base::sBuffer[ofs] |= (uint8_t)(bitmap_data);
|
|
||||||
}
|
|
||||||
if (yOffset != 0 && sRow < 7) {
|
|
||||||
Arduboy2Base::sBuffer[ofs + WIDTH] |= (*((unsigned char *) (&bitmap_data) + 1));
|
|
||||||
}
|
|
||||||
ofs++;
|
|
||||||
bofs++;
|
|
||||||
}
|
|
||||||
sRow++;
|
|
||||||
bofs += w - rendered_width;
|
|
||||||
ofs += WIDTH - rendered_width;
|
|
||||||
}
|
}
|
||||||
break;
|
if (yOffset != 0 && sRow < 7) {
|
||||||
|
data = Arduboy2Base::sBuffer[ofs + WIDTH];
|
||||||
case SPRITE_IS_MASK_ERASE:
|
data &= (*((unsigned char *) (&mask_data) + 1));
|
||||||
for (uint8_t a = 0; a < loop_h; a++) {
|
data |= (*((unsigned char *) (&bitmap_data) + 1));
|
||||||
for (uint8_t iCol = 0; iCol < rendered_width; iCol++) {
|
Arduboy2Base::sBuffer[ofs + WIDTH] = data;
|
||||||
bitmap_data = pgm_read_byte(bofs) * mul_amt;
|
|
||||||
if (sRow >= 0) {
|
|
||||||
Arduboy2Base::sBuffer[ofs] &= ~(uint8_t)(bitmap_data);
|
|
||||||
}
|
|
||||||
if (yOffset != 0 && sRow < 7) {
|
|
||||||
Arduboy2Base::sBuffer[ofs + WIDTH] &= ~(*((unsigned char *) (&bitmap_data) + 1));
|
|
||||||
}
|
|
||||||
ofs++;
|
|
||||||
bofs++;
|
|
||||||
}
|
|
||||||
sRow++;
|
|
||||||
bofs += w - rendered_width;
|
|
||||||
ofs += WIDTH - rendered_width;
|
|
||||||
}
|
}
|
||||||
break;
|
ofs++;
|
||||||
|
mask_ofs += ofs_step;
|
||||||
case SPRITE_MASKED:
|
bofs += ofs_step;
|
||||||
uint8_t *mask_ofs;
|
}
|
||||||
mask_ofs = (uint8_t *)mask + (start_h * w) + xOffset;
|
sRow++;
|
||||||
for (uint8_t a = 0; a < loop_h; a++) {
|
bofs += ofs_stride;
|
||||||
for (uint8_t iCol = 0; iCol < rendered_width; iCol++) {
|
mask_ofs += ofs_stride;
|
||||||
// NOTE: you might think in the yOffset==0 case that this results
|
ofs += WIDTH - rendered_width;
|
||||||
// in more effort, but in all my testing the compiler was forcing
|
|
||||||
// 16-bit math to happen here anyways, so this isn't actually
|
|
||||||
// compiling to more code than it otherwise would. If the offset
|
|
||||||
// is 0 the high part of the word will just never be used.
|
|
||||||
|
|
||||||
// load data and bit shift
|
|
||||||
// mask needs to be bit flipped
|
|
||||||
mask_data = ~(pgm_read_byte(mask_ofs) * mul_amt);
|
|
||||||
bitmap_data = pgm_read_byte(bofs) * mul_amt;
|
|
||||||
|
|
||||||
if (sRow >= 0) {
|
|
||||||
data = Arduboy2Base::sBuffer[ofs];
|
|
||||||
data &= (uint8_t)(mask_data);
|
|
||||||
data |= (uint8_t)(bitmap_data);
|
|
||||||
Arduboy2Base::sBuffer[ofs] = data;
|
|
||||||
}
|
|
||||||
if (yOffset != 0 && sRow < 7) {
|
|
||||||
data = Arduboy2Base::sBuffer[ofs + WIDTH];
|
|
||||||
data &= (*((unsigned char *) (&mask_data) + 1));
|
|
||||||
data |= (*((unsigned char *) (&bitmap_data) + 1));
|
|
||||||
Arduboy2Base::sBuffer[ofs + WIDTH] = data;
|
|
||||||
}
|
|
||||||
ofs++;
|
|
||||||
mask_ofs++;
|
|
||||||
bofs++;
|
|
||||||
}
|
|
||||||
sRow++;
|
|
||||||
bofs += w - rendered_width;
|
|
||||||
mask_ofs += w - rendered_width;
|
|
||||||
ofs += WIDTH - rendered_width;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
|
|
||||||
case SPRITE_PLUS_MASK:
|
|
||||||
// *2 because we use double the bits (mask + bitmap)
|
|
||||||
bofs = (uint8_t *)(bitmap + ((start_h * w) + xOffset) * 2);
|
|
||||||
|
|
||||||
uint8_t xi = rendered_width; // counter for x loop below
|
|
||||||
|
|
||||||
asm volatile(
|
|
||||||
"push r28\n" // save Y
|
|
||||||
"push r29\n"
|
|
||||||
"movw r28, %[buffer_ofs]\n" // Y = buffer_ofs_2
|
|
||||||
"adiw r28, 63\n" // buffer_ofs_2 = buffer_ofs + 128
|
|
||||||
"adiw r28, 63\n"
|
|
||||||
"adiw r28, 2\n"
|
|
||||||
"loop_y:\n"
|
|
||||||
"loop_x:\n"
|
|
||||||
// load bitmap and mask data
|
|
||||||
"lpm %A[bitmap_data], Z+\n"
|
|
||||||
"lpm %A[mask_data], Z+\n"
|
|
||||||
|
|
||||||
// shift mask and buffer data
|
|
||||||
"tst %[yOffset]\n"
|
|
||||||
"breq skip_shifting\n"
|
|
||||||
"mul %A[bitmap_data], %[mul_amt]\n"
|
|
||||||
"movw %[bitmap_data], r0\n"
|
|
||||||
"mul %A[mask_data], %[mul_amt]\n"
|
|
||||||
"movw %[mask_data], r0\n"
|
|
||||||
|
|
||||||
// SECOND PAGE
|
|
||||||
// if yOffset != 0 && sRow < 7
|
|
||||||
"cpi %[sRow], 7\n"
|
|
||||||
"brge end_second_page\n"
|
|
||||||
// then
|
|
||||||
"ld %[data], Y\n"
|
|
||||||
"com %B[mask_data]\n" // invert high byte of mask
|
|
||||||
"and %[data], %B[mask_data]\n"
|
|
||||||
"or %[data], %B[bitmap_data]\n"
|
|
||||||
// update buffer, increment
|
|
||||||
"st Y+, %[data]\n"
|
|
||||||
|
|
||||||
"end_second_page:\n"
|
|
||||||
"skip_shifting:\n"
|
|
||||||
|
|
||||||
// FIRST PAGE
|
|
||||||
// if sRow >= 0
|
|
||||||
"tst %[sRow]\n"
|
|
||||||
"brmi skip_first_page\n"
|
|
||||||
"ld %[data], %a[buffer_ofs]\n"
|
|
||||||
// then
|
|
||||||
"com %A[mask_data]\n"
|
|
||||||
"and %[data], %A[mask_data]\n"
|
|
||||||
"or %[data], %A[bitmap_data]\n"
|
|
||||||
// update buffer, increment
|
|
||||||
"st %a[buffer_ofs]+, %[data]\n"
|
|
||||||
"jmp end_first_page\n"
|
|
||||||
|
|
||||||
"skip_first_page:\n"
|
|
||||||
// since no ST Z+ when skipped we need to do this manually
|
|
||||||
"adiw %[buffer_ofs], 1\n"
|
|
||||||
|
|
||||||
"end_first_page:\n"
|
|
||||||
|
|
||||||
// "x_loop_next:\n"
|
|
||||||
"dec %[xi]\n"
|
|
||||||
"brne loop_x\n"
|
|
||||||
|
|
||||||
// increment y
|
|
||||||
"next_loop_y:\n"
|
|
||||||
"dec %[yi]\n"
|
|
||||||
"breq finished\n"
|
|
||||||
"mov %[xi], %[x_count]\n" // reset x counter
|
|
||||||
// sRow++;
|
|
||||||
"inc %[sRow]\n"
|
|
||||||
"clr __zero_reg__\n"
|
|
||||||
// sprite_ofs += (w - rendered_width) * 2;
|
|
||||||
"add %A[sprite_ofs], %A[sprite_ofs_jump]\n"
|
|
||||||
"adc %B[sprite_ofs], __zero_reg__\n"
|
|
||||||
// buffer_ofs += WIDTH - rendered_width;
|
|
||||||
"add %A[buffer_ofs], %A[buffer_ofs_jump]\n"
|
|
||||||
"adc %B[buffer_ofs], __zero_reg__\n"
|
|
||||||
// buffer_ofs_page_2 += WIDTH - rendered_width;
|
|
||||||
"add r28, %A[buffer_ofs_jump]\n"
|
|
||||||
"adc r29, __zero_reg__\n"
|
|
||||||
|
|
||||||
"rjmp loop_y\n"
|
|
||||||
"finished:\n"
|
|
||||||
// put the Y register back in place
|
|
||||||
"pop r29\n"
|
|
||||||
"pop r28\n"
|
|
||||||
"clr __zero_reg__\n" // just in case
|
|
||||||
: [xi] "+&a" (xi),
|
|
||||||
[yi] "+&a" (loop_h),
|
|
||||||
[sRow] "+&a" (sRow), // CPI requires an upper register (r16-r23)
|
|
||||||
[data] "=&l" (data),
|
|
||||||
[mask_data] "=&l" (mask_data),
|
|
||||||
[bitmap_data] "=&l" (bitmap_data)
|
|
||||||
:
|
|
||||||
[screen_width] "M" (WIDTH),
|
|
||||||
[x_count] "l" (rendered_width), // lower register
|
|
||||||
[sprite_ofs] "z" (bofs),
|
|
||||||
[buffer_ofs] "x" (Arduboy2Base::sBuffer+ofs),
|
|
||||||
[buffer_ofs_jump] "a" (WIDTH-rendered_width), // upper reg (r16-r23)
|
|
||||||
[sprite_ofs_jump] "a" ((w-rendered_width)*2), // upper reg (r16-r23)
|
|
||||||
|
|
||||||
// [sprite_ofs_jump] "r" (0),
|
|
||||||
[yOffset] "l" (yOffset), // lower register
|
|
||||||
[mul_amt] "l" (mul_amt) // lower register
|
|
||||||
// NOTE: We also clobber r28 and r29 (y) but sometimes the compiler
|
|
||||||
// won't allow us, so in order to make this work we don't tell it
|
|
||||||
// that we clobber them. Instead, we push/pop to preserve them.
|
|
||||||
// Then we need to guarantee that the the compiler doesn't put one of
|
|
||||||
// our own variables into r28/r29.
|
|
||||||
// We do that by specifying all the inputs and outputs use either
|
|
||||||
// lower registers (l) or simple (r16-r23) upper registers (a).
|
|
||||||
: // pushes/clobbers/pops r28 and r29 (y)
|
|
||||||
);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue