From 57392fcd0a15f138ffc8769271410c07164fe9f1 Mon Sep 17 00:00:00 2001 From: Delio Brignoli Date: Sun, 24 Dec 2017 17:23:04 +0100 Subject: [PATCH] Optimised Sprites::drawBitmap() for size --- src/Sprites.cpp | 261 +++++++----------------------------------------- 1 file changed, 38 insertions(+), 223 deletions(-) diff --git a/src/Sprites.cpp b/src/Sprites.cpp index 459e7f9..03e94f9 100644 --- a/src/Sprites.cpp +++ b/src/Sprites.cpp @@ -124,238 +124,53 @@ void Sprites::drawBitmap(int16_t x, int16_t y, sRow += start_h; ofs = (sRow * WIDTH) + x + xOffset; - uint8_t *bofs = (uint8_t *)bitmap + (start_h * w) + xOffset; - uint8_t data; uint8_t mul_amt = 1 << yOffset; uint16_t mask_data; uint16_t bitmap_data; - switch (draw_mode) { - case SPRITE_UNMASKED: - // we only want to mask the 8 bits of our own sprite, so we can - // calculate the mask before the start of the loop - mask_data = ~(0xFF * mul_amt); - // really if yOffset = 0 you have a faster case here that could be - // optimized - for (uint8_t a = 0; a < loop_h; a++) { - for (uint8_t iCol = 0; iCol < rendered_width; iCol++) { - bitmap_data = pgm_read_byte(bofs) * mul_amt; + const uint8_t ofs_step = draw_mode == SPRITE_PLUS_MASK ? 2 : 1; + const uint8_t ofs_stride = (w - rendered_width)*ofs_step; + const uint8_t initial_bofs = ((start_h * w) + xOffset)*ofs_step; - if (sRow >= 0) { - data = Arduboy2Base::sBuffer[ofs]; - data &= (uint8_t)(mask_data); - data |= (uint8_t)(bitmap_data); - Arduboy2Base::sBuffer[ofs] = data; - } - if (yOffset != 0 && sRow < 7) { - data = Arduboy2Base::sBuffer[ofs + WIDTH]; - data &= (*((unsigned char *) (&mask_data) + 1)); - data |= (*((unsigned char *) (&bitmap_data) + 1)); - Arduboy2Base::sBuffer[ofs + WIDTH] = data; - } - ofs++; - bofs++; - } - sRow++; - bofs += w - rendered_width; - ofs += WIDTH - rendered_width; + uint8_t *bofs = (uint8_t *)bitmap + initial_bofs; + uint8_t *mask_ofs = !mask ? (uint8_t *)bitmap : (uint8_t *)mask; + mask_ofs += initial_bofs + ofs_step - 1; + + for (uint8_t a = 0; a < loop_h; a++) { + for (uint8_t iCol = 0; iCol < rendered_width; iCol++) { + uint8_t data; + + bitmap_data = pgm_read_byte(bofs) * mul_amt; + mask_data = ~bitmap_data; + + if (draw_mode == SPRITE_UNMASKED) { + mask_data = ~(0xFF * mul_amt); + } else if (draw_mode == SPRITE_IS_MASK_ERASE) { + bitmap_data = 0; + } else { + mask_data = ~(pgm_read_byte(mask_ofs) * mul_amt); } - break; - case SPRITE_IS_MASK: - for (uint8_t a = 0; a < loop_h; a++) { - for (uint8_t iCol = 0; iCol < rendered_width; iCol++) { - bitmap_data = pgm_read_byte(bofs) * mul_amt; - if (sRow >= 0) { - Arduboy2Base::sBuffer[ofs] |= (uint8_t)(bitmap_data); - } - if (yOffset != 0 && sRow < 7) { - Arduboy2Base::sBuffer[ofs + WIDTH] |= (*((unsigned char *) (&bitmap_data) + 1)); - } - ofs++; - bofs++; - } - sRow++; - bofs += w - rendered_width; - ofs += WIDTH - rendered_width; + if (sRow >= 0) { + data = Arduboy2Base::sBuffer[ofs]; + data &= (uint8_t)(mask_data); + data |= (uint8_t)(bitmap_data); + Arduboy2Base::sBuffer[ofs] = data; } - break; - - case SPRITE_IS_MASK_ERASE: - for (uint8_t a = 0; a < loop_h; a++) { - for (uint8_t iCol = 0; iCol < rendered_width; iCol++) { - bitmap_data = pgm_read_byte(bofs) * mul_amt; - if (sRow >= 0) { - Arduboy2Base::sBuffer[ofs] &= ~(uint8_t)(bitmap_data); - } - if (yOffset != 0 && sRow < 7) { - Arduboy2Base::sBuffer[ofs + WIDTH] &= ~(*((unsigned char *) (&bitmap_data) + 1)); - } - ofs++; - bofs++; - } - sRow++; - bofs += w - rendered_width; - ofs += WIDTH - rendered_width; + if (yOffset != 0 && sRow < 7) { + data = Arduboy2Base::sBuffer[ofs + WIDTH]; + data &= (*((unsigned char *) (&mask_data) + 1)); + data |= (*((unsigned char *) (&bitmap_data) + 1)); + Arduboy2Base::sBuffer[ofs + WIDTH] = data; } - break; - - case SPRITE_MASKED: - uint8_t *mask_ofs; - mask_ofs = (uint8_t *)mask + (start_h * w) + xOffset; - for (uint8_t a = 0; a < loop_h; a++) { - for (uint8_t iCol = 0; iCol < rendered_width; iCol++) { - // NOTE: you might think in the yOffset==0 case that this results - // in more effort, but in all my testing the compiler was forcing - // 16-bit math to happen here anyways, so this isn't actually - // compiling to more code than it otherwise would. If the offset - // is 0 the high part of the word will just never be used. - - // load data and bit shift - // mask needs to be bit flipped - mask_data = ~(pgm_read_byte(mask_ofs) * mul_amt); - bitmap_data = pgm_read_byte(bofs) * mul_amt; - - if (sRow >= 0) { - data = Arduboy2Base::sBuffer[ofs]; - data &= (uint8_t)(mask_data); - data |= (uint8_t)(bitmap_data); - Arduboy2Base::sBuffer[ofs] = data; - } - if (yOffset != 0 && sRow < 7) { - data = Arduboy2Base::sBuffer[ofs + WIDTH]; - data &= (*((unsigned char *) (&mask_data) + 1)); - data |= (*((unsigned char *) (&bitmap_data) + 1)); - Arduboy2Base::sBuffer[ofs + WIDTH] = data; - } - ofs++; - mask_ofs++; - bofs++; - } - sRow++; - bofs += w - rendered_width; - mask_ofs += w - rendered_width; - ofs += WIDTH - rendered_width; - } - break; - - - case SPRITE_PLUS_MASK: - // *2 because we use double the bits (mask + bitmap) - bofs = (uint8_t *)(bitmap + ((start_h * w) + xOffset) * 2); - - uint8_t xi = rendered_width; // counter for x loop below - - asm volatile( - "push r28\n" // save Y - "push r29\n" - "movw r28, %[buffer_ofs]\n" // Y = buffer_ofs_2 - "adiw r28, 63\n" // buffer_ofs_2 = buffer_ofs + 128 - "adiw r28, 63\n" - "adiw r28, 2\n" - "loop_y:\n" - "loop_x:\n" - // load bitmap and mask data - "lpm %A[bitmap_data], Z+\n" - "lpm %A[mask_data], Z+\n" - - // shift mask and buffer data - "tst %[yOffset]\n" - "breq skip_shifting\n" - "mul %A[bitmap_data], %[mul_amt]\n" - "movw %[bitmap_data], r0\n" - "mul %A[mask_data], %[mul_amt]\n" - "movw %[mask_data], r0\n" - - // SECOND PAGE - // if yOffset != 0 && sRow < 7 - "cpi %[sRow], 7\n" - "brge end_second_page\n" - // then - "ld %[data], Y\n" - "com %B[mask_data]\n" // invert high byte of mask - "and %[data], %B[mask_data]\n" - "or %[data], %B[bitmap_data]\n" - // update buffer, increment - "st Y+, %[data]\n" - - "end_second_page:\n" - "skip_shifting:\n" - - // FIRST PAGE - // if sRow >= 0 - "tst %[sRow]\n" - "brmi skip_first_page\n" - "ld %[data], %a[buffer_ofs]\n" - // then - "com %A[mask_data]\n" - "and %[data], %A[mask_data]\n" - "or %[data], %A[bitmap_data]\n" - // update buffer, increment - "st %a[buffer_ofs]+, %[data]\n" - "jmp end_first_page\n" - - "skip_first_page:\n" - // since no ST Z+ when skipped we need to do this manually - "adiw %[buffer_ofs], 1\n" - - "end_first_page:\n" - - // "x_loop_next:\n" - "dec %[xi]\n" - "brne loop_x\n" - - // increment y - "next_loop_y:\n" - "dec %[yi]\n" - "breq finished\n" - "mov %[xi], %[x_count]\n" // reset x counter - // sRow++; - "inc %[sRow]\n" - "clr __zero_reg__\n" - // sprite_ofs += (w - rendered_width) * 2; - "add %A[sprite_ofs], %A[sprite_ofs_jump]\n" - "adc %B[sprite_ofs], __zero_reg__\n" - // buffer_ofs += WIDTH - rendered_width; - "add %A[buffer_ofs], %A[buffer_ofs_jump]\n" - "adc %B[buffer_ofs], __zero_reg__\n" - // buffer_ofs_page_2 += WIDTH - rendered_width; - "add r28, %A[buffer_ofs_jump]\n" - "adc r29, __zero_reg__\n" - - "rjmp loop_y\n" - "finished:\n" - // put the Y register back in place - "pop r29\n" - "pop r28\n" - "clr __zero_reg__\n" // just in case - : [xi] "+&a" (xi), - [yi] "+&a" (loop_h), - [sRow] "+&a" (sRow), // CPI requires an upper register (r16-r23) - [data] "=&l" (data), - [mask_data] "=&l" (mask_data), - [bitmap_data] "=&l" (bitmap_data) - : - [screen_width] "M" (WIDTH), - [x_count] "l" (rendered_width), // lower register - [sprite_ofs] "z" (bofs), - [buffer_ofs] "x" (Arduboy2Base::sBuffer+ofs), - [buffer_ofs_jump] "a" (WIDTH-rendered_width), // upper reg (r16-r23) - [sprite_ofs_jump] "a" ((w-rendered_width)*2), // upper reg (r16-r23) - - // [sprite_ofs_jump] "r" (0), - [yOffset] "l" (yOffset), // lower register - [mul_amt] "l" (mul_amt) // lower register - // NOTE: We also clobber r28 and r29 (y) but sometimes the compiler - // won't allow us, so in order to make this work we don't tell it - // that we clobber them. Instead, we push/pop to preserve them. - // Then we need to guarantee that the the compiler doesn't put one of - // our own variables into r28/r29. - // We do that by specifying all the inputs and outputs use either - // lower registers (l) or simple (r16-r23) upper registers (a). - : // pushes/clobbers/pops r28 and r29 (y) - ); - break; + ofs++; + mask_ofs += ofs_step; + bofs += ofs_step; + } + sRow++; + bofs += ofs_stride; + mask_ofs += ofs_stride; + ofs += WIDTH - rendered_width; } }