diff options
author | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
---|---|---|
committer | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
commit | 4fd287655a72b9aea14cdac715ad5b90ed082ed2 (patch) | |
tree | 65d393bc0e699dd12d05b29ba568e04cea666207 /circuitpython/lib/protomatter/src/core.c | |
parent | 0150f70ce9c39e9e6dd878766c0620c85e47bed0 (diff) |
add circuitpython code
Diffstat (limited to 'circuitpython/lib/protomatter/src/core.c')
-rw-r--r-- | circuitpython/lib/protomatter/src/core.c | 1302 |
1 files changed, 1302 insertions, 0 deletions
diff --git a/circuitpython/lib/protomatter/src/core.c b/circuitpython/lib/protomatter/src/core.c new file mode 100644 index 0000000..8428797 --- /dev/null +++ b/circuitpython/lib/protomatter/src/core.c @@ -0,0 +1,1302 @@ +/*! + * @file core.c + * + * Part of Adafruit's Protomatter library for HUB75-style RGB LED matrices. + * + * Adafruit invests time and resources providing this open source code, + * please support Adafruit and open-source hardware by purchasing + * products from Adafruit! + * + * Written by Phil "Paint Your Dragon" Burgess and Jeff Epler for + * Adafruit Industries, with contributions from the open source community. + * + * BSD license, all text here must be included in any redistribution. + * + */ + +// Device- and environment-neutral core matrix-driving functionality. +// See notes near top of arch/arch.h regarding assumptions of hardware +// "common ground." If you find yourself doing an "#ifdef ARDUINO" or +// "#ifdef _SAMD21_" in this file, STOP. Idea is that the code in this +// file is neutral and portable (within aforementioned assumptions). +// Nonportable elements should appear in arch.h. If arch.h functionality +// is lacking, extend it there, do not go making device- or environment- +// specific cases within this file. + +// Function names are intentionally a little obtuse, idea is that one writes +// a more sensible wrapper around this for specific environments (e.g. the +// Arduino stuff in Adafruit_Protomatter.cpp). The "_PM_" prefix on most +// things hopefully makes function and variable name collisions much less +// likely with one's own code. + +#include "core.h" // enums and structs +#include "arch/arch.h" // Do NOT include this in any other source files +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +// Overall matrix refresh rate (frames/second) is a function of matrix width +// and chain length, number of address lines, number of bit planes, CPU speed +// and whether or not a GPIO toggle register is available. There is no "this +// will run at X-frames-per-second" constant figure. You typically just have +// to try it out and perhaps trade off some bit planes for refresh rate until +// the image looks good and stable. Anything over 100 Hz is usually passable, +// around 250 Hz is where things firm up. And while this could proceed higher +// in some situations, the tradeoff is that faster rates use progressively +// more CPU time (because it's timer interrupt based and not using DMA or +// special peripherals). So a throttle is set here, an approximate maximum +// frame rate which the software will attempt to avoid exceeding (but may +// refresh slower than this, and in many cases will...just need to set an +// upper limit to avoid excessive CPU load). An incredibly long comment block +// for a single constant, thank you for coming to my TED talk! +#define _PM_MAX_REFRESH_HZ 250 ///< Max matrix refresh rate + +// Time (in microseconds) to pause following any change in address lines +// (individually or collectively). Some matrices respond slowly there... +// must pause on change for matrix to catch up. Defined here (rather than +// arch.h) because it's not architecture-specific. +#define _PM_ROW_DELAY 8 ///< Delay time between row address line changes (ms) + +// These are the lowest-level functions for issing data to matrices. +// There are three versions because it depends on how the six RGB data bits +// (and clock bit) are arranged within a 32-bit PORT register. If all six +// (seven) fit within one byte or word of the PORT, the library's memory +// use (and corresponding data-issuing function) change. This will also have +// an impact on parallel chains in the future, where the number of concurrent +// RGB data bits isn't always six, but some multiple thereof (i.e. up to five +// parallel outputs -- 30 RGB bits + clock -- on a 32-bit PORT, though that's +// largely hypothetical as the chance of finding a PORT with that many bits +// exposed and NOT interfering with other peripherals on a board is highly +// improbable. But I could see four happening, maybe on a Grand Central or +// other kitchen-sink board. +static void blast_byte(Protomatter_core *core, uint8_t *data); +static void blast_word(Protomatter_core *core, uint16_t *data); +static void blast_long(Protomatter_core *core, uint32_t *data); + +#define _PM_clearReg(x) \ + (*(volatile _PM_PORT_TYPE *)((x).clearReg) = \ + ((x).bit)) ///< Clear non-RGB-data-or-clock control line (_PM_pin type) +#define _PM_setReg(x) \ + (*(volatile _PM_PORT_TYPE *)((x).setReg) = \ + ((x).bit)) ///< Set non-RGB-data-or-clock control line (_PM_pin type) + +// Validate and populate vital elements of core structure. +// Does NOT allocate core struct -- calling function must provide that. +// (In the Arduino C++ library, it’s part of the Protomatter class.) +ProtomatterStatus _PM_init(Protomatter_core *core, uint16_t bitWidth, + uint8_t bitDepth, uint8_t rgbCount, uint8_t *rgbList, + uint8_t addrCount, uint8_t *addrList, + uint8_t clockPin, uint8_t latchPin, uint8_t oePin, + bool doubleBuffer, int8_t tile, void *timer) { + if (!core) + return PROTOMATTER_ERR_ARG; + + // bitDepth is NOT constrained here, handle in calling function + // (varies with implementation, e.g. GFX lib is max 6 bitplanes, + // but might be more or less elsewhere) + if (rgbCount > 5) + rgbCount = 5; // Max 5 in parallel (32-bit PORT) + if (addrCount > 5) + addrCount = 5; // Max 5 address lines (A-E) + if (!tile) + tile = 1; // Can't have zero vertical tiling. Single matrix is 1. + +#if defined(_PM_TIMER_DEFAULT) + // If NULL timer was passed in (the default case for the constructor), + // use default value from arch.h. For example, in the Arduino case it's + // tied to TC4 specifically. + if (timer == NULL) + timer = _PM_TIMER_DEFAULT; +#else + if (timer == NULL) + return PROTOMATTER_ERR_ARG; +#endif + + core->timer = timer; + core->width = bitWidth; // Matrix chain width in bits (NOT including V tile) + core->tile = tile; // Matrix chain vertical tiling + core->chainBits = bitWidth * abs(tile); // Total matrix chain bits + core->numPlanes = bitDepth; + core->parallel = rgbCount; + core->numAddressLines = addrCount; + core->clockPin = clockPin; + core->latch.pin = latchPin; + core->oe.pin = oePin; + core->doubleBuffer = doubleBuffer; + core->addr = NULL; + core->screenData = NULL; + + // Make a copy of the rgbList and addrList tables in case they're + // passed from local vars on the stack or some other non-persistent + // source. screenData is NOT allocated here because data size (byte, + // word, long) is not known until the begin function evaluates all + // the pin bitmasks. + + rgbCount *= 6; // Convert parallel count to pin count + if ((core->rgbPins = (uint8_t *)_PM_allocate(rgbCount * sizeof(uint8_t)))) { + if ((core->addr = (_PM_pin *)_PM_allocate(addrCount * sizeof(_PM_pin)))) { + memcpy(core->rgbPins, rgbList, rgbCount * sizeof(uint8_t)); + for (uint8_t i = 0; i < addrCount; i++) { + core->addr[i].pin = addrList[i]; + } + return PROTOMATTER_OK; + } + _PM_free(core->rgbPins); + core->rgbPins = NULL; + } + return PROTOMATTER_ERR_MALLOC; +} + +// Allocate display buffers and populate additional elements. +ProtomatterStatus _PM_begin(Protomatter_core *core) { + if (!core) + return PROTOMATTER_ERR_ARG; + + if (!core->rgbPins) { // NULL if copy failed to allocate + return PROTOMATTER_ERR_MALLOC; + } + + // Verify that rgbPins and clockPin are all on the same PORT. If not, + // return an error. Pin list is not freed; please call dealloc function. + // Also get bitmask of which bits within 32-bit PORT register are + // referenced. + uint8_t *port = (uint8_t *)_PM_portOutRegister(core->clockPin); +#if defined(_PM_portToggleRegister) + // If a bit-toggle register is present, the clock pin is included + // in determining which bytes of the PORT register are used (and thus + // the data storage efficiency). + uint32_t bitMask = _PM_portBitMask(core->clockPin); +#else + // If no bit-toggle register, clock pin can be on any bit, doesn't + // affect storage efficiency. + uint32_t bitMask = 0; +#endif + + for (uint8_t i = 0; i < core->parallel * 6; i++) { + uint8_t *p2 = (uint8_t *)_PM_portOutRegister(core->rgbPins[i]); + if (p2 != port) { + return PROTOMATTER_ERR_PINS; + } + bitMask |= _PM_portBitMask(core->rgbPins[i]); + } + + // RGB + clock are on same port, we can proceed... + + // Determine data type for internal representation. If all the data + // bitmasks (and possibly clock bitmask, depending whether toggle-bits + // register is present) are in the same byte, this can be stored more + // compact than if they're spread across a word or long. + uint8_t byteMask = 0; + if (bitMask & 0xFF000000) + byteMask |= 0b1000; + if (bitMask & 0x00FF0000) + byteMask |= 0b0100; + if (bitMask & 0x0000FF00) + byteMask |= 0b0010; + if (bitMask & 0x000000FF) + byteMask |= 0b0001; + switch (byteMask) { + case 0b0001: // If all PORT bits are in the same byte... + case 0b0010: + case 0b0100: + case 0b1000: + core->bytesPerElement = 1; // Use 8-bit PORT accesses. + break; + case 0b0011: // If all PORT bits in upper/lower word... + case 0b1100: + core->bytesPerElement = 2; // Use 16-bit PORT accesses. + // Although some devices might tolerate unaligned 16-bit accesses + // ('middle' word of 32-bit PORT), that is NOT handled here. + // It's a portability liability. + break; + default: // Any other situation... + core->bytesPerElement = 4; // Use 32-bit PORT accesses. + break; + } + + // Planning for screen data allocation... + core->numRowPairs = 1 << core->numAddressLines; + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + uint16_t columns = chunks * _PM_chunkSize; // Padded matrix width + uint32_t screenBytes = + columns * core->numRowPairs * core->numPlanes * core->bytesPerElement; + + core->bufferSize = screenBytes; // Bytes per matrix buffer (1 or 2) + if (core->doubleBuffer) + screenBytes *= 2; // Total for matrix buffer(s) + uint32_t rgbMaskBytes = core->parallel * 6 * core->bytesPerElement; + + // Allocate matrix buffer(s). Don't worry about the return type... + // though we might be using words or longs for certain pin configs, + // _PM_allocate() by definition always aligns to the longest type. + if (!(core->screenData = + (uint8_t *)_PM_allocate(screenBytes + rgbMaskBytes))) { + return PROTOMATTER_ERR_MALLOC; + } + + // rgbMask data follows the matrix buffer(s) + core->rgbMask = core->screenData + screenBytes; + +#if !defined(_PM_portToggleRegister) + // Clear entire screenData buffer so there's no cruft in any pad bytes + // (if using toggle register, each is set to clockMask below instead). + memset(core->screenData, 0, screenBytes); +#endif + + // Figure out clockMask and rgbAndClockMask, clear matrix buffers + if (core->bytesPerElement == 1) { + core->portOffset = _PM_byteOffset(core->rgbPins[0]); +#if defined(_PM_portToggleRegister) && !defined(_PM_STRICT_32BIT_IO) + // Clock and rgbAndClockMask are 8-bit values + core->clockMask = _PM_portBitMask(core->clockPin) >> (core->portOffset * 8); + core->rgbAndClockMask = + (bitMask >> (core->portOffset * 8)) | core->clockMask; + memset(core->screenData, core->clockMask, screenBytes); +#else + // Clock and rgbAndClockMask are 32-bit values + core->clockMask = _PM_portBitMask(core->clockPin); + core->rgbAndClockMask = bitMask | core->clockMask; +#endif + for (uint8_t i = 0; i < core->parallel * 6; i++) { + ((uint8_t *)core->rgbMask)[i] = // Pin bitmasks are 8-bit + _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 8); + } + } else if (core->bytesPerElement == 2) { + core->portOffset = _PM_wordOffset(core->rgbPins[0]); +#if defined(_PM_portToggleRegister) && !defined(_PM_STRICT_32BIT_IO) + // Clock and rgbAndClockMask are 16-bit values + core->clockMask = + _PM_portBitMask(core->clockPin) >> (core->portOffset * 16); + core->rgbAndClockMask = + (bitMask >> (core->portOffset * 16)) | core->clockMask; + uint32_t elements = screenBytes / 2; + for (uint32_t i = 0; i < elements; i++) { + ((uint16_t *)core->screenData)[i] = core->clockMask; + } +#else + // Clock and rgbAndClockMask are 32-bit values + core->clockMask = _PM_portBitMask(core->clockPin); + core->rgbAndClockMask = bitMask | core->clockMask; +#if defined(_PM_portToggleRegister) + // TO DO: this ifdef and the one above can probably be wrapped up + // in a more cohesive case. Think something similar will be needed + // for the byte case. Will need Teensy 4.1 to test. + uint32_t elements = screenBytes / 2; + uint16_t mask = core->clockMask >> (core->portOffset * 16); + for (uint32_t i = 0; i < elements; i++) { + ((uint16_t *)core->screenData)[i] = mask; + } +#endif +#endif + for (uint8_t i = 0; i < core->parallel * 6; i++) { + ((uint16_t *)core->rgbMask)[i] = // Pin bitmasks are 16-bit + _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 16); + } + } else { + core->portOffset = 0; + core->clockMask = _PM_portBitMask(core->clockPin); + core->rgbAndClockMask = bitMask | core->clockMask; +#if defined(_PM_portToggleRegister) + uint32_t elements = screenBytes / 4; + for (uint32_t i = 0; i < elements; i++) { + ((uint32_t *)core->screenData)[i] = core->clockMask; + } +#endif + for (uint8_t i = 0; i < core->parallel * 6; i++) { + ((uint32_t *)core->rgbMask)[i] = // Pin bitmasks are 32-bit + _PM_portBitMask(core->rgbPins[i]); + } + } + + // Estimate minimum bitplane #0 period for _PM_MAX_REFRESH_HZ rate. + uint32_t minPeriodPerFrame = _PM_timerFreq / _PM_MAX_REFRESH_HZ; + uint32_t minPeriodPerLine = minPeriodPerFrame / core->numRowPairs; + core->minPeriod = minPeriodPerLine / ((1 << core->numPlanes) - 1); + if (core->minPeriod < _PM_minMinPeriod) { + core->minPeriod = _PM_minMinPeriod; + } + core->bitZeroPeriod = core->minPeriod; + // Actual frame rate may be lower than this...it's only an estimate + // and does not factor in things like address line selection delays + // or interrupt overhead. That's OK, just don't want to exceed this + // rate, as it'll eat all the CPU cycles. + + core->activeBuffer = 0; + + // Configure pins as outputs and initialize their states. + + core->latch.setReg = _PM_portSetRegister(core->latch.pin); + core->latch.clearReg = _PM_portClearRegister(core->latch.pin); + core->latch.bit = _PM_portBitMask(core->latch.pin); + core->oe.setReg = _PM_portSetRegister(core->oe.pin); + core->oe.clearReg = _PM_portClearRegister(core->oe.pin); + core->oe.bit = _PM_portBitMask(core->oe.pin); + + _PM_pinOutput(core->clockPin); + _PM_pinLow(core->clockPin); // Init clock LOW + _PM_pinOutput(core->latch.pin); + _PM_pinLow(core->latch.pin); // Init latch LOW + _PM_pinOutput(core->oe.pin); + _PM_pinHigh(core->oe.pin); // Init OE HIGH (disable output) + + for (uint8_t i = 0; i < core->parallel * 6; i++) { + _PM_pinOutput(core->rgbPins[i]); + _PM_pinLow(core->rgbPins[i]); + } +#if defined(_PM_portToggleRegister) + core->addrPortToggle = _PM_portToggleRegister(core->addr[0].pin); + core->singleAddrPort = 1; +#endif + core->prevRow = (1 << core->numAddressLines) - 2; + for (uint8_t line = 0, bit = 1; line < core->numAddressLines; + line++, bit <<= 1) { + core->addr[line].setReg = _PM_portSetRegister(core->addr[line].pin); + core->addr[line].clearReg = _PM_portClearRegister(core->addr[line].pin); + core->addr[line].bit = _PM_portBitMask(core->addr[line].pin); + _PM_pinOutput(core->addr[line].pin); + if (core->prevRow & bit) { + _PM_pinHigh(core->addr[line].pin); + } else { + _PM_pinLow(core->addr[line].pin); + } +#if defined(_PM_portToggleRegister) + // If address pin on different port than addr 0, no singleAddrPort. + if (_PM_portToggleRegister(core->addr[line].pin) != core->addrPortToggle) { + core->singleAddrPort = 0; + } +#endif + } + + // Get pointers to bit set and clear registers (and toggle, if present) + core->setReg = (uint8_t *)_PM_portSetRegister(core->clockPin); + core->clearReg = (uint8_t *)_PM_portClearRegister(core->clockPin); +#if defined(_PM_portToggleRegister) + core->toggleReg = (uint8_t *)_PM_portToggleRegister(core->clockPin); +#endif + + // Reset plane/row counters, config and start timer + _PM_resume(core); + + return PROTOMATTER_OK; +} + +// Disable (but do not deallocate) a Protomatter matrix. Disables matrix by +// setting OE pin HIGH and writing all-zero data to matrix shift registers, +// so it won't halt with lit LEDs. +void _PM_stop(Protomatter_core *core) { + if ((core)) { + // If _PM_begin failed, this will be a NULL pointer. Stop early, + // none of the other "stop" operations make sense + if (!core->screenData) { + return; + } + while (core->swapBuffers) + ; // Wait for any pending buffer swap + _PM_timerStop(core->timer); // Halt timer + _PM_setReg(core->oe); // Set OE HIGH (disable output) + // So, in PRINCIPLE, setting OE high would be sufficient... + // but in case that pin is shared with another function such + // as the onloard LED (which pulses during bootloading) let's + // also clear out the matrix shift registers for good measure. + // Set all RGB pins LOW... + for (uint8_t i = 0; i < core->parallel * 6; i++) { + _PM_pinLow(core->rgbPins[i]); + } + // Clock out bits (just need to toggle clock with RGBs held low) + for (uint32_t i = 0; i < core->chainBits; i++) { + _PM_pinHigh(core->clockPin); + _PM_clockHoldHigh; + _PM_pinLow(core->clockPin); + _PM_clockHoldLow; + } + // Latch data + _PM_setReg(core->latch); + _PM_clearReg(core->latch); + } +} + +void _PM_resume(Protomatter_core *core) { + if ((core)) { + // Init plane & row to max values so they roll over on 1st interrupt + core->plane = core->numPlanes - 1; + core->row = core->numRowPairs - 1; + core->prevRow = (core->numRowPairs > 1) ? (core->row - 1) : 1; + core->swapBuffers = 0; + core->frameCount = 0; + + for (uint8_t line = 0, bit = 1; line < core->numAddressLines; + line++, bit <<= 1) { + _PM_pinOutput(core->addr[line].pin); + if (core->prevRow & bit) { + _PM_pinHigh(core->addr[line].pin); + } else { + _PM_pinLow(core->addr[line].pin); + } + } + + _PM_timerInit(core->timer); // Configure timer + _PM_timerStart(core->timer, 1000); // Start timer + } +} + +// Free memory associated with core structure. Does NOT dealloc struct. +void _PM_deallocate(Protomatter_core *core) { + if ((core)) { + _PM_stop(core); + // TO DO: Set all pins back to inputs here? + if (core->screenData) + _PM_free(core->screenData); + if (core->addr) + _PM_free(core->addr); + if (core->rgbPins) { + _PM_free(core->rgbPins); + core->rgbPins = NULL; + } + } +} + +// ISR function (in arch.h) calls this function which it extern'd. +// Profuse apologies for the ESP32-specific IRAM_ATTR here -- the goal was +// for all architecture-specific detauls to be in arch.h -- but the need +// for one here caught me off guard. So, in arch.h, for all non-ESP32 +// devices, IRAM_ATTR is defined to nothing and is ignored here. If any +// future architectures have their own attribute for making a function +// RAM-resident, #define IRAM_ATTR to that in the corresponding device- +// specific section of arch.h. Sorry. :/ +// Any functions called by this function should also be IRAM_ATTR'd. +IRAM_ATTR void _PM_row_handler(Protomatter_core *core) { + + _PM_setReg(core->oe); // Disable LED output + + // ESP32 requires this next line, but not wanting to put arch-specific + // ifdefs in this code...it's a trivial operation so just do it. + // Latch is already clear at this point, but we go through the motions + // to clear it again in order to sync up the setReg(OE) above with the + // setReg(latch) that follows. Reason being, bit set/clear operations + // on ESP32 aren't truly atomic, and if those two pins are on the same + // port (quite common) the second setReg will be ignored. The nonsense + // clearReg is used to sync up the two setReg operations. See also the + // ESP32-specific PEW define in arch.h, same deal. + _PM_clearReg(core->latch); + + _PM_setReg(core->latch); + (void)_PM_timerStop(core->timer); + uint8_t prevPlane = core->plane; // Save that plane # for later timing + _PM_clearReg(core->latch); // (split to add a few cycles) + + if (prevPlane == 0) { // Plane 0 just finished loading +#if defined(_PM_portToggleRegister) + // If all address lines are on a single PORT (and bit toggle is + // available), do address line change all at once. Even doing all + // this math takes MUCH less time than the delays required when + // doing line-by-line changes. + if (core->singleAddrPort) { + // Make bitmasks of prior and new row bits + uint32_t priorBits = 0, newBits = 0; + for (uint8_t line = 0, bit = 1; line < core->numAddressLines; + line++, bit <<= 1) { + if (core->row & bit) { + newBits |= core->addr[line].bit; + } + if (core->prevRow & bit) { + priorBits |= core->addr[line].bit; + } + } + *(volatile _PM_PORT_TYPE *)core->addrPortToggle = newBits ^ priorBits; + _PM_delayMicroseconds(_PM_ROW_DELAY); + } else { +#endif + // Configure row address lines individually, making changes + // (with delays) only where necessary. + for (uint8_t line = 0, bit = 1; line < core->numAddressLines; + line++, bit <<= 1) { + if ((core->row & bit) != (core->prevRow & bit)) { + if (core->row & bit) { // Set addr line high + _PM_setReg(core->addr[line]); + } else { // Set addr line low + _PM_clearReg(core->addr[line]); + } + _PM_delayMicroseconds(_PM_ROW_DELAY); + } + } +#if defined(_PM_portToggleRegister) + } +#endif + core->prevRow = core->row; + } + + // Advance bitplane index and/or row as necessary + if (++core->plane >= core->numPlanes) { // Next data bitplane, or + core->plane = 0; // roll over bitplane to start + if (++core->row >= core->numRowPairs) { // Next row, or + core->row = 0; // roll over row to start + // Switch matrix buffers if due (only if double-buffered) + if (core->swapBuffers) { + core->activeBuffer = 1 - core->activeBuffer; + core->swapBuffers = 0; // Swapped! + } + core->frameCount++; + } + } + + // core->plane now is index of data to issue, NOT data to display. + // 'prevPlane' is the previously-loaded data, which gets displayed + // now while the next plane data is loaded. + + // Set timer and enable LED output for data loaded on PRIOR pass: + _PM_timerStart(core->timer, core->bitZeroPeriod << prevPlane); + _PM_delayMicroseconds(1); // Appease Teensy4 + _PM_clearReg(core->oe); // Enable LED output + + uint32_t elementsPerLine = + _PM_chunkSize * ((core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize); + uint32_t srcOffset = elementsPerLine * + (core->numPlanes * core->row + core->plane) * + core->bytesPerElement; + if (core->doubleBuffer) { + srcOffset += core->bufferSize * core->activeBuffer; + } + + if (core->bytesPerElement == 1) { + blast_byte(core, (uint8_t *)(core->screenData + srcOffset)); + } else if (core->bytesPerElement == 2) { + blast_word(core, (uint16_t *)(core->screenData + srcOffset)); + } else { + blast_long(core, (uint32_t *)(core->screenData + srcOffset)); + } + + // core->plane data is now loaded, will be shown on NEXT pass + + // On the last (longest) bitplane (note that 'plane' has already wrapped + // around earlier, so a value of 0 here indicates longest plane), take + // note of the elapsed timer value at this point...that's the number of + // cycles required to issue (not necessarily display) data for one plane, + // and the bare minimum display duration allowed for plane 0. + if ((core->numPlanes > 1) && (core->plane == 0)) { + // Determine number of timer cycles taken to issue the data. + // It can vary slightly if heavy interrupts happen, things like that. + // Timer is still running and counting up at this point. + uint32_t elapsed = _PM_timerGetCount(core->timer); + // Nudge the plane-zero time up or down (filtering to avoid jitter) + core->bitZeroPeriod = ((core->bitZeroPeriod * 7) + elapsed + 4) / 8; + // But don't allow it to drop below the minimum period calculated during + // begin(), that's a hard limit and would just waste cycles. + if (core->bitZeroPeriod < core->minPeriod) { + core->bitZeroPeriod = core->minPeriod; + } + } +} + +// Innermost data-stuffing loop functions + +// The presence of a bit-toggle register can make the data-stuffing loop a +// fair bit faster (2 PORT accesses per column vs 3). But ironically, some +// devices (e.g. SAMD51) can outpace the matrix max CLK speed, so we slow +// them down with a few NOPs. These are defined in arch.h as needed. +// _PM_clockHoldLow is whatever code necessary to delay the clock rise +// after data is placed on the PORT. _PM_clockHoldHigh is code for delay +// before setting the clock back low. If undefined, nothing goes there. + +#if !defined(PEW) // arch.h can define a custom PEW if needed (e.g. ESP32) + +#if !defined(_PM_STRICT_32BIT_IO) // Partial access to 32-bit GPIO OK + +#if defined(_PM_portToggleRegister) +#define PEW \ + *toggle = *data++; /* Toggle in new data + toggle clock low */ \ + _PM_clockHoldLow; \ + *toggle = clock; /* Toggle clock high */ \ + _PM_clockHoldHigh; +#else +#define PEW \ + *set = *data++; /* Set RGB data high */ \ + _PM_clockHoldLow; \ + *set_full = clock; /* Set clock high */ \ + _PM_clockHoldHigh; \ + *clear_full = rgbclock; /* Clear RGB data + clock */ \ + ///< Bitbang one set of RGB data bits to matrix +#endif + +#else // ONLY 32-bit GPIO + +#if defined(_PM_portToggleRegister) +#define PEW \ + *toggle = *data++ << shift; /* Toggle in new data + toggle clock low */ \ + _PM_clockHoldLow; \ + *toggle = clock; /* Toggle clock high */ \ + _PM_clockHoldHigh; +#else +#define PEW \ + *set = *data++ << shift; /* Set RGB data high */ \ + _PM_clockHoldLow; \ + *set = clock; /* Set clock high */ \ + _PM_clockHoldHigh; \ + *clear_full = rgbclock; /* Clear RGB data + clock */ \ + ///< Bitbang one set of RGB data bits to matrix +#endif + +#endif // end 32-bit GPIO + +#endif // end PEW + +#if _PM_chunkSize == 1 +#define PEW_UNROLL PEW +#elif _PM_chunkSize == 2 +#define PEW_UNROLL PEW PEW ///< 2-way PEW unroll +#elif _PM_chunkSize == 4 +#define PEW_UNROLL PEW PEW PEW PEW ///< 4-way PEW unroll +#elif _PM_chunkSize == 8 +#define PEW_UNROLL PEW PEW PEW PEW PEW PEW PEW PEW ///< 8-way PEW unroll +#elif _PM_chunkSize == 16 +#define PEW_UNROLL \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW +#elif _PM_chunkSize == 32 +#define PEW_UNROLL \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW +#elif _PM_chunkSize == 64 +#define PEW_UNROLL \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ + PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW +#else +#error "Unimplemented _PM_chunkSize value" +#endif + +// There are THREE COPIES of the following function -- one each for byte, +// word and long. If changes are made in any one of them, the others MUST +// be updated to match! (Decided against using macro tricks for the +// function, too often ends in disaster...but must be vigilant in the +// three-function maintenance then.) + +IRAM_ATTR static void blast_byte(Protomatter_core *core, uint8_t *data) { +#if !defined(_PM_STRICT_32BIT_IO) // Partial access to 32-bit GPIO OK + +#if defined(_PM_portToggleRegister) + // If here, it was established in begin() that the RGB data bits and + // clock are all within the same byte of a PORT register, else we'd be + // in the word- or long-blasting functions now. So we just need an + // 8-bit pointer to the PORT. + volatile uint8_t *toggle = + (volatile uint8_t *)core->toggleReg + core->portOffset; +#else + // No-toggle version is a little different. If here, RGB data is all + // in one byte of PORT register, clock can be any bit in 32-bit PORT. + volatile uint8_t *set; // For RGB data set + volatile _PM_PORT_TYPE *set_full; // For clock set + volatile _PM_PORT_TYPE *clear_full; // For RGB data + clock clear + set = (volatile uint8_t *)core->setReg + core->portOffset; + set_full = (volatile _PM_PORT_TYPE *)core->setReg; + clear_full = (volatile _PM_PORT_TYPE *)core->clearReg; + _PM_PORT_TYPE rgbclock = core->rgbAndClockMask; // RGB + clock bit +#endif + _PM_PORT_TYPE clock = core->clockMask; // Clock bit + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + + // PORT has already been initialized with RGB data + clock bits + // all LOW, so we don't need to initialize that state here. + + while (chunks--) { + PEW_UNROLL // _PM_chunkSize RGB+clock writes + } + +#if defined(_PM_portToggleRegister) + // Want the PORT left with RGB data and clock LOW on function exit + // (so it's easier to see on 'scope, and to prime it for the next call). + // This is implicit in the no-toggle case (due to how the PEW macro + // works), but toggle case requires explicitly clearing those bits. + // rgbAndClockMask is an 8-bit value when toggling, hence offset here. + *((volatile uint8_t *)core->clearReg + core->portOffset) = + core->rgbAndClockMask; +#endif + +#else // ONLY 32-bit GPIO + +#if defined(_PM_portToggleRegister) + volatile _PM_PORT_TYPE *toggle = (volatile _PM_PORT_TYPE *)core->toggleReg; +#else + volatile _PM_PORT_TYPE *set = (volatile _PM_PORT_TYPE *)core->setReg; + volatile _PM_PORT_TYPE *clear_full = (volatile _PM_PORT_TYPE *)core->clearReg; + _PM_PORT_TYPE rgbclock = core->rgbAndClockMask; // RGB + clock bit +#endif + _PM_PORT_TYPE clock = core->clockMask; // Clock bit + uint8_t shift = core->portOffset * 8; + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + + // PORT has already been initialized with RGB data + clock bits + // all LOW, so we don't need to initialize that state here. + + while (chunks--) { + PEW_UNROLL // _PM_chunkSize RGB+clock writes + } + +#if defined(_PM_portToggleRegister) + *((volatile uint32_t *)core->clearReg) = core->rgbAndClockMask; +#endif + +#endif // 32-bit GPIO +} + +IRAM_ATTR static void blast_word(Protomatter_core *core, uint16_t *data) { +#if !defined(_PM_STRICT_32BIT_IO) // Partial access to 32-bit GPIO OK + +#if defined(_PM_portToggleRegister) + // See notes above -- except now 16-bit word in PORT. + volatile uint16_t *toggle = + (volatile uint16_t *)core->toggleReg + core->portOffset; +#else + volatile uint16_t *set; // For RGB data set + volatile _PM_PORT_TYPE *set_full; // For clock set + volatile _PM_PORT_TYPE *clear_full; // For RGB data + clock clear + set = (volatile uint16_t *)core->setReg + core->portOffset; + set_full = (volatile _PM_PORT_TYPE *)core->setReg; + clear_full = (volatile _PM_PORT_TYPE *)core->clearReg; + _PM_PORT_TYPE rgbclock = core->rgbAndClockMask; // RGB + clock bit +#endif + _PM_PORT_TYPE clock = core->clockMask; // Clock bit + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + while (chunks--) { + PEW_UNROLL // _PM_chunkSize RGB+clock writes + } +#if defined(_PM_portToggleRegister) + // rgbAndClockMask is a 16-bit value when toggling, hence offset here. + *((volatile uint16_t *)core->clearReg + core->portOffset) = + core->rgbAndClockMask; +#endif + +#else // ONLY 32-bit GPIO + +#if defined(_PM_portToggleRegister) + volatile _PM_PORT_TYPE *toggle = (volatile _PM_PORT_TYPE *)core->toggleReg; +#else + volatile _PM_PORT_TYPE *set = (volatile _PM_PORT_TYPE *)core->setReg; + volatile _PM_PORT_TYPE *clear_full = (volatile _PM_PORT_TYPE *)core->clearReg; + _PM_PORT_TYPE rgbclock = core->rgbAndClockMask; // RGB + clock bit +#endif + _PM_PORT_TYPE clock = core->clockMask; // Clock bit + uint8_t shift = core->portOffset * 16; + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + while (chunks--) { + PEW_UNROLL // _PM_chunkSize RGB+clock writes + } +#if defined(_PM_portToggleRegister) + *((volatile _PM_PORT_TYPE *)core->clearReg) = core->rgbAndClockMask; +#endif + +#endif // 32-bit GPIO +} + +IRAM_ATTR static void blast_long(Protomatter_core *core, uint32_t *data) { +#if defined(_PM_portToggleRegister) + // See notes above -- except now full 32-bit PORT. + volatile uint32_t *toggle = (volatile uint32_t *)core->toggleReg; +#else + // Note in this case two copies exist of the PORT set register. + // The optimizer will most likely simplify this; leaving as-is, not + // wanting a special case of the PEW macro due to divergence risk. + volatile uint32_t *set; // For RGB data set +#if !defined(_PM_STRICT_32BIT_IO) + volatile _PM_PORT_TYPE *set_full; // For clock set + set_full = (volatile _PM_PORT_TYPE *)core->setReg; +#endif + volatile _PM_PORT_TYPE *clear_full; // For RGB data + clock clear + set = (volatile uint32_t *)core->setReg; + clear_full = (volatile _PM_PORT_TYPE *)core->clearReg; + _PM_PORT_TYPE rgbclock = core->rgbAndClockMask; // RGB + clock bit +#endif + _PM_PORT_TYPE clock = core->clockMask; // Clock bit +#if defined(_PM_STRICT_32BIT_IO) + uint8_t shift = 0; +#endif + uint8_t chunks = (core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize; + while (chunks--) { + PEW_UNROLL // _PM_chunkSize RGB+clock writes + } +#if defined(_PM_portToggleRegister) + *(volatile uint32_t *)core->clearReg = core->rgbAndClockMask; +#endif +} + +// Returns current value of frame counter and resets its value to zero. +// Two calls to this, timed one second apart (or use math with other +// intervals), can be used to get a rough frames-per-second value for +// the matrix (since this is difficult to estimate beforehand). +uint32_t _PM_getFrameCount(Protomatter_core *core) { + uint32_t count = 0; + if ((core)) { + count = core->frameCount; + core->frameCount = 0; + } + return count; +} + +void _PM_swapbuffer_maybe(Protomatter_core *core) { + if (core->doubleBuffer) { + core->swapBuffers = 1; + // To avoid overwriting data on the matrix, don't return + // until the timer ISR has performed the swap at the right time. + while (core->swapBuffers) + ; + } +} + +#if defined(ARDUINO) || defined(CIRCUITPY) + +// Arduino and CircuitPython happen to use the same internal canvas +// representation. + +// 16-bit (565) color conversion functions go here (rather than in the +// Arduino lib .cpp) because knowledge is required of chunksize and the +// toggle register (or lack thereof), which are only known to this file, +// not the .cpp or anywhere else. However...this file knows nothing of +// the GFXcanvas16 type (from Adafruit_GFX...another C++ lib), so the +// .cpp just passes down some pointers and minimal info about the canvas +// buffer. It's probably not ideal but this is my life now, oh well. + +// Different runtime environments (which might not use the 565 canvas +// format) will need their own conversion functions. + +// There are THREE COPIES of the following function -- one each for byte, +// word and long. If changes are made in any one of them, the others MUST +// be updated to match! Note that they are not simple duplicates of each +// other. The byte case, for example, doesn't need to handle parallel +// matrix chains (matrix data can only be byte-sized if one chain). + +// width argument comes from GFX canvas width, which may be less than +// core's bitWidth (due to padding). height isn't needed, it can be +// inferred from core->numRowPairs and core->tile. +__attribute__((noinline)) void _PM_convert_565_byte(Protomatter_core *core, + const uint16_t *source, + uint16_t width) { + uint8_t *pinMask = (uint8_t *)core->rgbMask; // Pin bitmasks + uint8_t *dest = (uint8_t *)core->screenData; + if (core->doubleBuffer) { + dest += core->bufferSize * (1 - core->activeBuffer); + } + +#if defined(_PM_portToggleRegister) +#if !defined(_PM_STRICT_32BIT_IO) + // core->clockMask mask is already an 8-bit value + uint8_t clockMask = core->clockMask; +#else + // core->clockMask mask is 32-bit, shift down to 8-bit for this func. + uint8_t clockMask = core->clockMask >> (core->portOffset * 8); +#endif +#endif + + // No need to clear matrix buffer, loops below do a full overwrite + // (except for any scanline pad, which was already initialized in the + // begin() function and won't be touched here). + + // Determine matrix bytes per bitplane & row (row pair really): + + // Size of 1 plane of row pair (across full chain / tile set) + uint32_t bitplaneSize = + _PM_chunkSize * ((core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize); + uint8_t pad = bitplaneSize - core->chainBits; // Plane-start pad + + // Skip initial scanline padding if present (HUB75 matrices shift data + // in from right-to-left, so if we need scanline padding it occurs at + // the start of a line, rather than the usual end). Destination pointer + // passed in already handles double-buffer math, so we don't need to + // handle that here, just the pad... + dest += pad; + + uint32_t initialRedBit, initialGreenBit, initialBlueBit; + if (core->numPlanes == 6) { + // If numPlanes is 6, red and blue are expanded from 5 to 6 bits. + // This involves duplicating the MSB of the 5-bit value to the LSB + // of its corresponding 6-bit value...or in this case, bitmasks for + // red and blue are initially assigned to canvas MSBs, while green + // starts at LSB (because it's already 6-bit). Inner loop below then + // wraps red & blue after the first bitplane. + initialRedBit = 0b1000000000000000; // MSB red + initialGreenBit = 0b0000000000100000; // LSB green + initialBlueBit = 0b0000000000010000; // MSB blue + } else { + // If numPlanes is 1 to 5, no expansion is needed, and one or all + // three color components might be decimated by some number of bits. + // The initial bitmasks are set to the components' numPlanesth bit + // (e.g. for 5 planes, start at red & blue bit #0, green bit #1, + // for 4 planes, everything starts at the next bit up, etc.). + uint8_t shiftLeft = 5 - core->numPlanes; + initialRedBit = 0b0000100000000000 << shiftLeft; + initialGreenBit = 0b0000000001000000 << shiftLeft; + initialBlueBit = 0b0000000000000001 << shiftLeft; + } + + // This works sequentially-ish through the destination buffer, + // reading from the canvas source pixels in repeated passes, + // beginning from the least bit. + for (uint8_t row = 0; row < core->numRowPairs; row++) { + uint32_t redBit = initialRedBit; + uint32_t greenBit = initialGreenBit; + uint32_t blueBit = initialBlueBit; + for (uint8_t plane = 0; plane < core->numPlanes; plane++) { +#if defined(_PM_portToggleRegister) + uint8_t prior = clockMask; // Set clock bit on 1st out +#endif + uint8_t *d2 = dest; // Incremented per-pixel across all tiles + + // Work from bottom tile to top, because data is issued in that order + for (int8_t tile = abs(core->tile) - 1; tile >= 0; tile--) { + const uint16_t *upperSrc, *lowerSrc; // Canvas scanline pointers + int16_t srcIdx; + int8_t srcInc; + + // Source pointer to tile's upper-left pixel + const uint16_t *srcTileUL = + source + tile * width * core->numRowPairs * 2; + if ((tile & 1) && (core->tile < 0)) { + // Special handling for serpentine tiles + lowerSrc = srcTileUL + width * (core->numRowPairs - 1 - row); + upperSrc = lowerSrc + width * core->numRowPairs; + srcIdx = width - 1; // Work right to left + srcInc = -1; + } else { + // Progressive tile + upperSrc = srcTileUL + width * row; // Top row + lowerSrc = upperSrc + width * core->numRowPairs; // Bottom row + srcIdx = 0; // Left to right + srcInc = 1; + } + + for (uint16_t x = 0; x < width; x++, srcIdx += srcInc) { + uint16_t upperRGB = upperSrc[srcIdx]; // Pixel in upper half + uint16_t lowerRGB = lowerSrc[srcIdx]; // Pixel in lower half + uint8_t result = 0; + if (upperRGB & redBit) + result |= pinMask[0]; + if (upperRGB & greenBit) + result |= pinMask[1]; + if (upperRGB & blueBit) + result |= pinMask[2]; + if (lowerRGB & redBit) + result |= pinMask[3]; + if (lowerRGB & greenBit) + result |= pinMask[4]; + if (lowerRGB & blueBit) + result |= pinMask[5]; +#if defined(_PM_portToggleRegister) + *d2++ = result ^ prior; + prior = result | clockMask; // Set clock bit on next out +#else + *d2++ = result; +#endif + } // end x + } // end tile + + greenBit <<= 1; + if (plane || (core->numPlanes < 6)) { + // In most cases red & blue bit scoot 1 left... + redBit <<= 1; + blueBit <<= 1; + } else { + // Exception being after bit 0 with 6-plane display, + // in which case they're reset to red & blue LSBs + // (so 5-bit colors are expanded to 6 bits). + redBit = 0b0000100000000000; + blueBit = 0b0000000000000001; + } +#if defined(_PM_portToggleRegister) + // If using bit-toggle register, erase the toggle bit on the + // first element of each bitplane & row pair. The matrix-driving + // interrupt functions correspondingly set the clock low before + // finishing. This is all done for legibility on oscilloscope -- + // so idle clock appears LOW -- but really the matrix samples on + // a rising edge and we could leave it high, but at this stage + // in development just want the scope "readable." + dest[-pad] &= ~clockMask; // Negative index is legal & intentional +#endif + dest += bitplaneSize; // Advance one scanline in dest buffer + } // end plane + } // end row +} + +// Corresponding function for word output -- either 12 RGB bits (2 parallel +// matrix chains), or 1 chain with RGB bits not in the same byte (but in the +// same 16-bit word). Some of the comments have been stripped out since it's +// largely the same operation, but changes are noted. +// WORD OUTPUT IS UNTESTED AND ROW TILING MAY ESPECIALLY PRESENT ISSUES. +void _PM_convert_565_word(Protomatter_core *core, uint16_t *source, + uint16_t width) { + uint16_t *pinMask = (uint16_t *)core->rgbMask; // Pin bitmasks + uint16_t *dest = (uint16_t *)core->screenData; + if (core->doubleBuffer) { + dest += core->bufferSize / core->bytesPerElement * (1 - core->activeBuffer); + } + + // Size of 1 plane of row pair (across full chain / tile set) + uint32_t bitplaneSize = + _PM_chunkSize * ((core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize); + uint8_t pad = bitplaneSize - core->chainBits; // Plane-start pad + + uint32_t initialRedBit, initialGreenBit, initialBlueBit; + if (core->numPlanes == 6) { + initialRedBit = 0b1000000000000000; // MSB red + initialGreenBit = 0b0000000000100000; // LSB green + initialBlueBit = 0b0000000000010000; // MSB blue + } else { + uint8_t shiftLeft = 5 - core->numPlanes; + initialRedBit = 0b0000100000000000 << shiftLeft; + initialGreenBit = 0b0000000001000000 << shiftLeft; + initialBlueBit = 0b0000000000000001 << shiftLeft; + } + + // Unlike the 565 byte converter, the word converter DOES clear out the + // matrix buffer (because each chain is OR'd into place). If a toggle + // register exists, "clear" really means the clock mask is set in all + // but the first element on a scanline (per bitplane). If no toggle + // register, can just zero everything out. +#if defined(_PM_portToggleRegister) + // No per-chain loop is required; one clock bit handles all chains + uint32_t offset = 0; // Current position in the 'dest' buffer + uint16_t mask = core->clockMask >> (core->portOffset * 16); + for (uint8_t row = 0; row < core->numRowPairs; row++) { + for (uint8_t plane = 0; plane < core->numPlanes; plane++) { + dest[offset++] = 0; // First element of each plane + for (uint16_t x = 1; x < bitplaneSize; x++) { // All subsequent items + dest[offset++] = mask; + } + } + } +#else + memset(dest, 0, core->bufferSize); +#endif + + dest += pad; // Pad value is in 'elements,' not bytes, so this is OK + + for (uint8_t chain = 0; chain < core->parallel; chain++) { + for (uint8_t row = 0; row < core->numRowPairs; row++) { + uint32_t redBit = initialRedBit; + uint32_t greenBit = initialGreenBit; + uint32_t blueBit = initialBlueBit; + for (uint8_t plane = 0; plane < core->numPlanes; plane++) { +#if defined(_PM_portToggleRegister) + // Since we're ORing in bits over an existing clock bit, + // prior is 0 rather than clockMask as in the byte case. + uint16_t prior = 0; +#endif + uint16_t *d2 = dest; // Incremented per-pixel across all tiles + + // Work from bottom tile to top, because data is issued in that order + for (int8_t tile = abs(core->tile) - 1; tile >= 0; tile--) { + uint16_t *upperSrc, *lowerSrc; // Canvas scanline pointers + int16_t srcIdx; + int8_t srcInc; + + // Source pointer to tile's upper-left pixel + uint16_t *srcTileUL = source + (chain * abs(core->tile) + tile) * + width * core->numRowPairs * 2; + if ((tile & 1) && (core->tile < 0)) { + // Special handling for serpentine tiles + lowerSrc = srcTileUL + width * (core->numRowPairs - 1 - row); + upperSrc = lowerSrc + width * core->numRowPairs; + srcIdx = width - 1; // Work right to left + srcInc = -1; + } else { + // Progressive tile + upperSrc = srcTileUL + width * row; // Top row + lowerSrc = upperSrc + width * core->numRowPairs; // Bottom row + srcIdx = 0; // Left to right + srcInc = 1; + } + + for (uint16_t x = 0; x < width; x++, srcIdx += srcInc) { + uint16_t upperRGB = upperSrc[srcIdx]; // Pixel in upper half + uint16_t lowerRGB = lowerSrc[srcIdx]; // Pixel in lower half + uint16_t result = 0; + if (upperRGB & redBit) + result |= pinMask[0]; + if (upperRGB & greenBit) + result |= pinMask[1]; + if (upperRGB & blueBit) + result |= pinMask[2]; + if (lowerRGB & redBit) + result |= pinMask[3]; + if (lowerRGB & greenBit) + result |= pinMask[4]; + if (lowerRGB & blueBit) + result |= pinMask[5]; + // Main difference here vs byte converter is each chain + // ORs new bits into place (vs single-pass overwrite). +#if defined(_PM_portToggleRegister) + *d2++ |= result ^ prior; // Bitwise OR + prior = result; +#else + *d2++ |= result; // Bitwise OR +#endif + } // end x + } // end tile + greenBit <<= 1; + if (plane || (core->numPlanes < 6)) { + redBit <<= 1; + blueBit <<= 1; + } else { + redBit = 0b0000100000000000; + blueBit = 0b0000000000000001; + } + dest += bitplaneSize; // Advance one scanline in dest buffer + } // end plane + } // end row + pinMask += 6; // Next chain's RGB pin masks + } +} + +// Corresponding function for long output -- either several parallel chains +// (up to 5), or 1 chain with RGB bits scattered widely about the PORT. +// Same deal, comments are pared back, see above functions for explanations. +// LONG OUTPUT IS UNTESTED AND ROW TILING MAY ESPECIALLY PRESENT ISSUES. +void _PM_convert_565_long(Protomatter_core *core, uint16_t *source, + uint16_t width) { + uint32_t *pinMask = (uint32_t *)core->rgbMask; // Pin bitmasks + uint32_t *dest = (uint32_t *)core->screenData; + if (core->doubleBuffer) { + dest += core->bufferSize / core->bytesPerElement * (1 - core->activeBuffer); + } + + // Size of 1 plane of row pair (across full chain / tile set) + uint32_t bitplaneSize = + _PM_chunkSize * ((core->chainBits + (_PM_chunkSize - 1)) / _PM_chunkSize); + uint8_t pad = bitplaneSize - core->chainBits; // Plane-start pad + + uint32_t initialRedBit, initialGreenBit, initialBlueBit; + if (core->numPlanes == 6) { + initialRedBit = 0b1000000000000000; // MSB red + initialGreenBit = 0b0000000000100000; // LSB green + initialBlueBit = 0b0000000000010000; // MSB blue + } else { + uint8_t shiftLeft = 5 - core->numPlanes; + initialRedBit = 0b0000100000000000 << shiftLeft; + initialGreenBit = 0b0000000001000000 << shiftLeft; + initialBlueBit = 0b0000000000000001 << shiftLeft; + } + +#if defined(_PM_portToggleRegister) + // No per-chain loop is required; one clock bit handles all chains + uint32_t offset = 0; // Current position in the 'dest' buffer + for (uint8_t row = 0; row < core->numRowPairs; row++) { + for (uint8_t plane = 0; plane < core->numPlanes; plane++) { + dest[offset++] = 0; // First element of each plane + for (uint16_t x = 1; x < bitplaneSize; x++) { // All subsequent items + dest[offset++] = core->clockMask; + } + } + } +#else + memset(dest, 0, core->bufferSize); +#endif + + dest += pad; // Pad value is in 'elements,' not bytes, so this is OK + + for (uint8_t chain = 0; chain < core->parallel; chain++) { + for (uint8_t row = 0; row < core->numRowPairs; row++) { + uint32_t redBit = initialRedBit; + uint32_t greenBit = initialGreenBit; + uint32_t blueBit = initialBlueBit; + for (uint8_t plane = 0; plane < core->numPlanes; plane++) { +#if defined(_PM_portToggleRegister) + uint32_t prior = 0; +#endif + uint32_t *d2 = dest; // Incremented per-pixel across all tiles + + // Work from bottom tile to top, because data is issued in that order + for (int8_t tile = abs(core->tile) - 1; tile >= 0; tile--) { + uint16_t *upperSrc, *lowerSrc; // Canvas scanline pointers + int16_t srcIdx; + int8_t srcInc; + + // Source pointer to tile's upper-left pixel + uint16_t *srcTileUL = source + (chain * abs(core->tile) + tile) * + width * core->numRowPairs * 2; + if ((tile & 1) && (core->tile < 0)) { + // Special handling for serpentine tiles + lowerSrc = srcTileUL + width * (core->numRowPairs - 1 - row); + upperSrc = lowerSrc + width * core->numRowPairs; + srcIdx = width - 1; // Work right to left + srcInc = -1; + } else { + // Progressive tile + upperSrc = srcTileUL + width * row; // Top row + lowerSrc = upperSrc + width * core->numRowPairs; // Bottom row + srcIdx = 0; // Left to right + srcInc = 1; + } + + for (uint16_t x = 0; x < width; x++, srcIdx += srcInc) { + uint16_t upperRGB = upperSrc[srcIdx]; // Pixel in upper half + uint16_t lowerRGB = lowerSrc[srcIdx]; // Pixel in lower half + uint32_t result = 0; + if (upperRGB & redBit) + result |= pinMask[0]; + if (upperRGB & greenBit) + result |= pinMask[1]; + if (upperRGB & blueBit) + result |= pinMask[2]; + if (lowerRGB & redBit) + result |= pinMask[3]; + if (lowerRGB & greenBit) + result |= pinMask[4]; + if (lowerRGB & blueBit) + result |= pinMask[5]; + // Main difference here vs byte converter is each chain + // ORs new bits into place (vs single-pass overwrite). +#if defined(_PM_portToggleRegister) + *d2++ |= result ^ prior; // Bitwise OR + prior = result; +#else + *d2++ |= result; // Bitwise OR +#endif + } // end x + } // end tile + greenBit <<= 1; + if (plane || (core->numPlanes < 6)) { + redBit <<= 1; + blueBit <<= 1; + } else { + redBit = 0b0000100000000000; + blueBit = 0b0000000000000001; + } + dest += bitplaneSize; // Advance one scanline in dest buffer + } // end plane + } // end row + pinMask += 6; // Next chain's RGB pin masks + } +} + +void _PM_convert_565(Protomatter_core *core, uint16_t *source, uint16_t width) { + // Destination address is computed in convert function + // (based on active buffer value, if double-buffering), + // just need to pass in the canvas buffer address and + // width in pixels. + if (core->bytesPerElement == 1) { + _PM_convert_565_byte(core, source, width); + } else if (core->bytesPerElement == 2) { + _PM_convert_565_word(core, source, width); + } else { + _PM_convert_565_long(core, source, width); + } +} + +#endif // END ARDUINO || CIRCUITPY + +/* NOTES TO FUTURE SELF ---------------------------------------------------- + +ON BYTES, WORDS and LONGS: +I've gone back and forth between implementing all this either as it +currently is (with byte, word and long cases for various steps), or using +a uint32_t[64] table for expanding RGB bit combos to PORT bit combos. +The latter would certainly simplify the code a ton, and the additional +table lookup step wouldn't significantly impact performance, especially +going forward with faster processors (several devices already require a +few NOPs in the innermost loop to avoid outpacing the matrix). +BUT, the reason this is NOT currently done is that it only allows for a +single matrix chain (doing parallel chains would require either an +impractically large lookup table, or adding together multiple tables' +worth of bitmasks, which would slow things down in the vital inner loop). +Although parallel matrix chains aren't yet 100% implemented in this code +right now, I wanted to leave that possibility for the future, as a way to +handle larger matrix combos, because long chains will slow down the +refresh rate. +*/ |