Initial commit

2018-11-26 00:49:19 +01:00
commit 368127e848
355 changed files with 67999 additions and 0 deletions
@@ -0,0 +1,313 @@
+#ifndef __INC_M0_CLOCKLESS_H
+#define __INC_M0_CLOCKLESS_H
+
+// Per-frame state handed to showLedData().  The assembly in that function addresses
+// these fields by fixed byte offset from the struct base, so the layout must not change.
+struct M0ClocklessData {
+  uint8_t d[3];   // bytes 0-2: dither values; loaded by loaddither7, rewritten as e-d by adjdither7
+  uint8_t s[3];   // bytes 3-5: scale factors; scale4 reads them at offset 3+RO(n)
+  uint8_t e[3];   // bytes 6-8: dither 'e' terms; adjdither7 reads them at offset 6+RO(n)
+  uint8_t adj;    // byte 9: amount incleds3 adds to the led pointer after each pixel
+};
+
+
+// Bit-bangs num_leds pixels (three bytes each) out of one pin using hand-timed
+// Thumb assembly.
+//
+// Each output bit occupies one "slot" assembled from the convenience macros below:
+//   HI2   store bitmask to [port + HI_OFFSET]
+//   D1    delay for T1 cycles, less the cycles already accounted for
+//   QLO4  shift the next bit out of b; if it is 0, store bitmask to [port + LO_OFFSET]
+//   work  one step of preparing the following byte (load, dither, scale, ...)
+//   D2    delay for T2 cycles, less the cycles spent on QLO4 and the work step
+//   LO2   store bitmask to [port + LO_OFFSET]
+//   D3    delay for T3 cycles, less the cycles already accounted for
+//
+// Template parameters:
+//   HI_OFFSET, LO_OFFSET  immediate byte offsets from _port used for the two stores
+//   T1, T2, T3            cycle budgets handed to the mod_delay asm macro
+//   RGB_ORDER             not referenced directly here; presumably consumed by the RO()
+//                         macro, which is defined elsewhere -- TODO confirm
+//   WAIT_TIME             not used in this function
+//
+// Arguments:
+//   _port     base address the HI/LO offsets are applied to
+//   _bitmask  value stored to the port registers
+//   _leds     source bytes; advanced by the byte at pData+9 once per pixel
+//   num_leds  number of pixels (loop iterations)
+//   pData     dither/scale/advance state, addressed by fixed offsets from the asm
+//
+// Returns num_leds.
+template<int HI_OFFSET, int LO_OFFSET, int T1, int T2, int T3, EOrder RGB_ORDER, int WAIT_TIME>int
+showLedData(volatile uint32_t *_port, uint32_t _bitmask, const uint8_t *_leds, uint32_t num_leds, struct M0ClocklessData *pData) {
+  // Lo register variables
+  register uint32_t scratch=0;
+  register struct M0ClocklessData *base = pData;
+  register volatile uint32_t *port = _port;
+  register uint32_t d=0;
+  register uint32_t counter=num_leds;
+  register uint32_t bn=0;
+  register uint32_t b=0;
+  register uint32_t bitmask = _bitmask;
+
+  // high register variable
+  register const uint8_t *leds = _leds;
+
+  // This first asm statement emits no instructions of its own: it only defines the
+  // assembler macros used further down.  The .ifnotdef guard on fl_delay_def keeps the
+  // definitions from being repeated if this asm text is emitted more than once.
+  asm __volatile__ (
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    // asm macro definitions - used to assemble the clockless output
+    //
+    ".ifnotdef fl_delay_def;"
+#ifdef FASTLED_ARM_M0_PLUS
+    "  .set fl_is_m0p, 1;"
+    "  .macro m0pad;"
+    "    nop;"
+    "  .endm;"
+#else
+    "  .set fl_is_m0p, 0;"
+    "  .macro m0pad;"
+    "  .endm;"
+#endif
+    "  .set fl_delay_def, 1;"
+    "  .set fl_delay_mod, 4;"
+    "  .if fl_is_m0p == 1;"
+    "    .set fl_delay_mod, 3;"
+    "  .endif;"
+    // fl_delay: emits (dtime % fl_delay_mod) nops followed by a counted sub/bne loop of
+    // (dtime / fl_delay_mod) iterations, using reg as the loop counter.
+    "  .macro fl_delay dtime, reg=r0;"
+    "    .if (\\dtime > 0);"
+    "      .set dcycle, (\\dtime / fl_delay_mod);"
+    "      .set dwork, (dcycle * fl_delay_mod);"
+    "      .set drem, (\\dtime - dwork);"
+    "      .rept (drem);"
+    "        nop;"
+    "      .endr;"
+    "      .if dcycle > 0;"
+    "        mov \\reg, #dcycle;"
+    "        delayloop_\\@:;"
+    "        sub \\reg, #1;"
+    "        bne delayloop_\\@;"
+    "	     .if fl_is_m0p == 0;"
+    "          nop;"
+    "        .endif;"
+    "      .endif;"
+    "    .endif;"
+    "  .endm;"
+
+    // mod_delay: delays for dtime minus (b1 + b2); emits nothing if that is not positive
+    "  .macro mod_delay dtime,b1,b2,reg;"
+    "    .set adj, (\\b1 + \\b2);"
+    "    .if adj < \\dtime;"
+    "      .set dtime2, (\\dtime - adj);"
+    "      fl_delay dtime2, \\reg;"
+    "    .endif;"
+    "  .endm;"
+
+    // check the bit and drop the line low if it isn't set
+    "  .macro qlo4 b,bitmask,port,loff	;"
+    "    lsl \\b, #1			;"
+    "    bcs skip_\\@			;"
+    "    str \\bitmask, [\\port, \\loff]	;"
+    "    skip_\\@:			;"
+    "    m0pad;"
+    "  .endm				;"
+
+    // set the pin hi or low (determined by the offset passed in )
+    "  .macro qset2 bitmask,port,loff;"
+    "    str \\bitmask, [\\port, \\loff];"
+    "    m0pad;"
+    "  .endm;"
+
+    // Load up the next led byte to work with, put it in bn
+    "  .macro loadleds3 leds, bn, rled, scratch;"
+    "    mov \\scratch, \\leds;"
+    "    ldrb \\bn, [\\scratch, \\rled];"
+    "  .endm;"
+
+    // check whether or not we should dither
+    "  .macro loaddither7 bn,d,base,rdither;"
+    "    ldrb \\d, [\\base, \\rdither];"
+    "    lsl \\d, #24;"  //; shift high for the qadd w/bn
+    "    lsl \\bn, #24;" //; shift high for the qadd w/d
+    "    bne chkskip_\\@;" //; if bn==0, clear d;"
+    "    eor \\d, \\d;" //; clear d;"
+    "    m0pad;"
+    "    chkskip_\\@:;"
+    "  .endm;"
+
+    // Do the qadd8 for dithering -- there's two versions of this.  The m0 version
+    // takes advantage of the 3 cycle branch to do two things after the branch,
+    // while keeping timing constant.  The m0+, however, branches in 2 cycles, so
+    // we have to work around that a bit more.  This is one of the few times
+    // where the m0 will actually be _more_ efficient than the m0+
+    "  .macro dither5 bn,d;"
+    "  .syntax unified;"
+    "    .if fl_is_m0p == 0;"
+    "      adds \\bn, \\d;"         // do the add
+    "      bcc dither5_1_\\@;"
+    "      mvns \\bn, \\bn;"        // set the low 24bits ot 1's
+    "      lsls \\bn, \\bn, #24;"   // move low 8 bits to the high bits
+    "      dither5_1_\\@:;"
+    "      nop;"                    // nop to keep timing in line
+    "    .else;"
+    "      adds \\bn, \\d;"         // do the add"
+    "      bcc dither5_2_\\@;"
+    "      mvns \\bn, \\bn;"        // set the low 24bits ot 1's
+    "      dither5_2_\\@:;"
+    "      bcc dither5_3_\\@;"
+    "      lsls \\bn, \\bn, #24;"   // move low 8 bits to the high bits
+    "      dither5_3_\\@:;"
+    "    .endif;"
+    "  .syntax divided;"
+    "  .endm;"
+
+    // Do our scaling
+    "  .macro scale4 bn, base, scale, scratch;"
+    "    ldrb \\scratch, [\\base, \\scale];"
+    "    lsr \\bn, \\bn, #24;"                  // bring bn back down to its low 8 bits
+    "    mul \\bn, \\scratch;"                  // do the multiply
+    "  .endm;"
+
+    // swap bn into b
+    "  .macro swapbbn1 b,bn;"
+    "    lsl \\b, \\bn, #16;"  // put the 8 bits we want for output high
+    "  .endm;"
+
+    // adjust the dithering value for the next time around (load e from memory
+    // to do the math)
+    "  .macro adjdither7 base,d,rled,eoffset,scratch;"
+    "    ldrb \\d, [\\base, \\rled];"
+    "    ldrb \\scratch,[\\base,\\eoffset];"          // load e
+    "    .syntax unified;"
+    "    subs \\d, \\scratch, \\d;"                   // d=e-d
+    "    .syntax divided;"
+    "    strb \\d, [\\base, \\rled];"                 // save d
+    "  .endm;"
+
+    // increment the led pointer (base+9 has what we're incrementing by)
+    "  .macro incleds3   leds, base, scratch;"
+    "    ldrb \\scratch, [\\base, #9];"               // load incremen
+    "    add \\leds, \\leds, \\scratch;"              // update leds pointer
+    "  .endm;"
+
+    // compare and loop
+    "  .macro cmploop5 counter,label;"
+    "    .syntax unified;"
+    "    subs \\counter, #1;"
+    "    .syntax divided;"
+    "    beq done_\\@;"
+    "    m0pad;"
+    "    b \\label;"
+    "    done_\\@:;"
+    "  .endm;"
+
+    " .endif;"
+  );
+
+// Operand list shared by every asm statement below.  leds is constrained to a high
+// register ("+h"); the other read/write operands and the pointers use low registers
+// ("l"); offsets and timings are passed as immediates ("I").
+#define M0_ASM_ARGS     :             \
+      [leds] "+h" (leds),             \
+      [counter] "+l" (counter),       \
+      [scratch] "+l" (scratch),       \
+      [d] "+l" (d),                   \
+      [bn] "+l" (bn),                 \
+      [b] "+l" (b)                    \
+    :                                 \
+      [port] "l" (port),              \
+      [base] "l" (base),              \
+      [bitmask] "l" (bitmask),        \
+      [hi_off] "I" (HI_OFFSET),       \
+      [lo_off] "I" (LO_OFFSET),       \
+      [led0] "I" (RO(0)),             \
+      [led1] "I" (RO(1)),             \
+      [led2] "I" (RO(2)),             \
+      [scale0] "I" (3+RO(0)),         \
+      [scale1] "I" (3+RO(1)),         \
+      [scale2] "I" (3+RO(2)),         \
+      [e0] "I" (6+RO(0)),             \
+      [e1] "I" (6+RO(1)),             \
+      [e2] "I" (6+RO(2)),             \
+      [T1] "I" (T1),                  \
+      [T2] "I" (T2),                  \
+      [T3] "I" (T3)                   \
+    :
+
+    /////////////////////////////////////////////////////////////////////////
+    // Convenience macros to make building our lines a bit cleaner.  The numeric
+    // suffix on each name matches the ADJ value passed to D2() on the row that uses
+    // it (e.g. LOADLEDS3 is paired with D2(3)).
+#define LOOP            "  loop_%=:"
+#define HI2             "  qset2 %[bitmask], %[port], %[hi_off];"
+#define D1              "  mod_delay %c[T1],2,0,%[scratch];"
+#define QLO4            "  qlo4 %[b],%[bitmask],%[port], %[lo_off];"
+#define LOADLEDS3(X)    "  loadleds3 %[leds], %[bn], %[led" #X "] ,%[scratch];"
+#define D2(ADJ)         "  mod_delay %c[T2],4," #ADJ ",%[scratch];"
+#define LO2             "  qset2 %[bitmask], %[port], %[lo_off];"
+#define D3(ADJ)         "  mod_delay %c[T3],2," #ADJ ",%[scratch];"
+#define LOADDITHER7(X)  "  loaddither7 %[bn], %[d], %[base], %[led" #X "];"
+#define DITHER5         "  dither5 %[bn], %[d];"
+#define SCALE4(X)       "  scale4 %[bn], %[base], %[scale" #X "], %[scratch];"
+#define SWAPBBN1        "  swapbbn1 %[b], %[bn];"
+#define ADJDITHER7(X)   "  adjdither7 %[base],%[d],%[led" #X "],%[e" #X "],%[scratch];"
+#define INCLEDS3        "  incleds3 %[leds],%[base],%[scratch];"
+#define CMPLOOP5        "  cmploop5 %[counter], loop_%=;"
+#define NOTHING         ""
+
+#if !(defined(SEI_CHK) && (FASTLED_ALLOW_INTERRUPTS == 1))
+    // We're not allowing interrupts - run the entire loop in asm to keep things
+    // as tight as possible.  In an ideal world, we should be pushing out ws281x
+    // leds (or other 3-wire leds) with zero gaps between pixels.
+    asm __volatile__ (
+      // pre-load byte 0
+    LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
+
+    // loop over writing out the data
+    LOOP
+      // Write out byte 0, prepping byte 1
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(1)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(1)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(1)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(1)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(0)
+
+      // Write out byte 1, prepping byte 2
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(2)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(2)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(2)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(2)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 INCLEDS3        D2(3) LO2 D3(0)
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(0)
+
+      // Write out byte 2, prepping byte 0
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(0)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(0)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(0)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(0)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      // the last slot's D3 is shortened by 5 to make room for CMPLOOP5
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(5) CMPLOOP5
+
+      M0_ASM_ARGS
+    );
+#else
+    // We're allowing interrupts - track the loop outside the asm code, to allow
+    // inserting the interrupt overrun checks.
+    asm __volatile__ (
+      // pre-load byte 0
+      LOADLEDS3(0) LOADDITHER7(0) DITHER5 SCALE4(0) ADJDITHER7(0) SWAPBBN1
+      M0_ASM_ARGS);
+
+    do {
+      asm __volatile__ (
+      // Write out byte 0, prepping byte 1
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(1)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(1)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(1)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(1)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(0)
+
+      // Write out byte 1, prepping byte 2
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(2)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(2)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(2)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(2)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 INCLEDS3        D2(3) LO2 D3(0)
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(0)
+
+      // Write out byte 2, prepping byte 0
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 LOADLEDS3(0)    D2(3) LO2 D3(0)
+      HI2 D1 QLO4 LOADDITHER7(0)  D2(7) LO2 D3(0)
+      HI2 D1 QLO4 DITHER5         D2(5) LO2 D3(0)
+      HI2 D1 QLO4 SCALE4(0)       D2(4) LO2 D3(0)
+      HI2 D1 QLO4 ADJDITHER7(0)   D2(7) LO2 D3(0)
+      HI2 D1 QLO4 NOTHING         D2(0) LO2 D3(0)
+      HI2 D1 QLO4 SWAPBBN1        D2(1) LO2 D3(5)
+
+      M0_ASM_ARGS
+      );
+      // SEI_CHK / INNER_SEI / CLI_CHK are macros defined outside this file; presumably
+      // they open a short interrupt window between pixels -- TODO confirm
+      SEI_CHK; INNER_SEI; --counter; CLI_CHK;
+    } while(counter);
+#endif
+    return num_leds;
+}
+
+#endif
@@ -0,0 +1,89 @@
+#ifndef __INC_CLOCKLESS_ARM_D21
+#define __INC_CLOCKLESS_ARM_D21
+
+#include "platforms/arm/common/m0clockless.h"
+FASTLED_NAMESPACE_BEGIN
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Single-pin clockless LED driver for the SAMD21.  The bit timing itself is produced by
+// showLedData() (m0clockless.h); this class wraps it with the minimum inter-frame wait
+// and interrupt masking.
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+  typedef typename FastPinBB<DATA_PIN>::port_ptr_t data_ptr_t;
+  typedef typename FastPinBB<DATA_PIN>::port_t data_t;
+
+  data_t mPinMask;
+  data_ptr_t mPort;
+  CMinWait<WAIT_TIME> mWait;
+
+public:
+  // Configure the data pin as an output and cache its mask and port address.
+  virtual void init() {
+    FastPinBB<DATA_PIN>::setOutput();
+    mPinMask = FastPinBB<DATA_PIN>::mask();
+    mPort = FastPinBB<DATA_PIN>::port();
+  }
+
+  virtual uint16_t getMaxRefreshRate() const {
+    return 400;
+  }
+
+  // Blank nLeds pixels by showing black at zero scale.
+  virtual void clearLeds(int nLeds) {
+    showColor(CRGB(0, 0, 0), nLeds, 0);
+  }
+
+  // Show one colour on all nLeds pixels.
+  virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    pushFrame(pixels);
+  }
+
+  // Show an array of nLeds pixels.
+  virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    pushFrame(pixels);
+  }
+
+#ifdef SUPPORT_ARGB
+  // Show an array of nLeds ARGB pixels.
+  virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    pushFrame(pixels);
+  }
+#endif
+
+  // Copies the dither, scale and advance state out of the pixel controller into the flat
+  // structure the assembly expects, then clocks the data out.  Static, so it takes no
+  // `this` pointer.  Always returns 0.
+  static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+    struct M0ClocklessData state;
+    for(int ch = 0; ch < 3; ch++) {
+      state.d[ch] = pixels.d[ch];
+      state.s[ch] = pixels.mScale[ch];
+      state.e[ch] = pixels.e[ch];
+    }
+    state.adj = pixels.mAdvance;
+
+    // 8 and 4 are the byte offsets, relative to the port address, used for the
+    // "high" and "low" stores respectively.
+    showLedData<8,4,T1,T2,T3,RGB_ORDER, WAIT_TIME>(FastPin<DATA_PIN>::port(), FastPin<DATA_PIN>::mask(), pixels.mData, pixels.mLen, &state);
+    return 0;
+  }
+
+private:
+  // Common tail of every show call: honour the minimum wait since the previous frame,
+  // send the frame with interrupts masked, then restart the wait timer.
+  void pushFrame(PixelController<RGB_ORDER> & pixels) {
+    mWait.wait();
+    cli();
+
+    showRGBInternal(pixels);
+
+    sei();
+    mWait.mark();
+  }
+};
+
+FASTLED_NAMESPACE_END
+
+
+#endif // __INC_CLOCKLESS_ARM_D21
@@ -0,0 +1,8 @@
+#ifndef __INC_FASTLED_ARM_D21_H
+#define __INC_FASTLED_ARM_D21_H
+
+#include "fastled_delay.h"
+#include "fastpin_arm_d21.h"
+#include "clockless_arm_d21.h"
+
+#endif
@@ -0,0 +1,95 @@
+#ifndef __INC_FASTPIN_ARM_SAM_H
+#define __INC_FASTPIN_ARM_SAM_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+// FASTLED_FORCE_SOFTWARE_PINS was requested: tell the user, and make sure only the
+// "no hardware pin support" flag is left defined.
+#warning "Software pin support forced, pin access will be slightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+/// Template definition for SAMD21 pins, providing direct access to the PORT group output
+/// registers (OUT, OUTSET, OUTCLR, OUTTGL).  Every operation works on the full 32-bit
+/// register of group _GRP and uses _MASK to select this pin.
+///
+/// @tparam PIN   Arduino pin number, passed to pinMode() for direction changes
+/// @tparam _BIT  bit index of the pin within its group (not used by the methods below)
+/// @tparam _MASK mask selecting the pin within the group's registers
+/// @tparam _GRP  index into PORT->Group[]
+template<uint8_t PIN, uint8_t _BIT, uint32_t _MASK, int _GRP> class _ARMPIN {
+public:
+  typedef volatile uint32_t * port_ptr_t;
+  typedef uint32_t port_t;
+
+  // Direction changes are delegated to the Arduino core's pinMode().
+  // TODO (carried over from the original): perform the MUX configuration directly.
+  inline static void setOutput() { pinMode(PIN, OUTPUT); }
+  inline static void setInput() { pinMode(PIN, INPUT); }
+
+  // Drive the pin high / low through the set and clear registers.
+  inline static void hi() __attribute__ ((always_inline)) { PORT->Group[_GRP].OUTSET.reg = _MASK; }
+  inline static void lo() __attribute__ ((always_inline)) { PORT->Group[_GRP].OUTCLR.reg = _MASK; }
+  // Overwrite the whole OUT register of the group with val.
+  inline static void set(port_t val) __attribute__ ((always_inline)) { PORT->Group[_GRP].OUT.reg = val; }
+
+  // Two back-to-back toggles.
+  inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+  inline static void toggle() __attribute__ ((always_inline)) { PORT->Group[_GRP].OUTTGL.reg = _MASK; }
+
+  // Pointer-taking overloads: the port argument is ignored and the group's own
+  // set/clear registers are used.
+  inline static void hi(port_ptr_t) __attribute__ ((always_inline)) { hi(); }
+  inline static void lo(port_ptr_t) __attribute__ ((always_inline)) { lo(); }
+  // Write val through the caller-supplied register pointer.
+  inline static void fastset(port_ptr_t port, port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+  // Current OUT register value with this pin's bit forced on / off.
+  inline static port_t hival() __attribute__ ((always_inline)) { return PORT->Group[_GRP].OUT.reg | _MASK; }
+  inline static port_t loval() __attribute__ ((always_inline)) { return PORT->Group[_GRP].OUT.reg & ~_MASK; }
+  // Addresses of the OUT, OUTSET and OUTCLR registers for this pin's group.
+  inline static port_ptr_t port() __attribute__ ((always_inline)) { return &PORT->Group[_GRP].OUT.reg; }
+  inline static port_ptr_t sport() __attribute__ ((always_inline)) { return &PORT->Group[_GRP].OUTSET.reg; }
+  inline static port_ptr_t cport() __attribute__ ((always_inline)) { return &PORT->Group[_GRP].OUTCLR.reg; }
+  inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+// Helpers for generated register-accessor structs: _R(T) names the struct, _RD32(T)
+// defines it with an r() that returns T as a volatile PortGroup pointer, and _IO32(L)
+// applies _RD32 to GPIO<L>.
+#define _R(T) struct __gen_struct_ ## T
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline volatile PortGroup * r() { return T; } };
+
+#define _IO32(L) _RD32(GPIO ## L)
+
+// Specialise FastPin<PIN> for bit BIT of port group L.  The mask is built from an
+// unsigned constant so that BIT == 31 does not overflow a signed int.
+#define _DEFPIN_ARM(PIN, L, BIT) template<> class FastPin<PIN> : public _ARMPIN<PIN, BIT, (1UL << (BIT)), L> {};
+
+// Actual pin definitions
+// Pin table for the Arduino Zero.  Each entry is _DEFPIN_ARM(arduino pin, port group, bit).
+// Several pin numbers deliberately resolve to the same group/bit pair (for example pins
+// 20 and 32 are both group 0, bit 22).
+#if defined(ARDUINO_SAMD_ZERO)
+
+// Highest pin number defined below.
+#define MAX_PIN 42
+_DEFPIN_ARM( 0,0,10); _DEFPIN_ARM( 1,0,11); _DEFPIN_ARM( 2,0, 8); _DEFPIN_ARM( 3,0, 9);
+_DEFPIN_ARM( 4,0,14); _DEFPIN_ARM( 5,0,15); _DEFPIN_ARM( 6,0,20); _DEFPIN_ARM( 7,0,21);
+_DEFPIN_ARM( 8,0, 6); _DEFPIN_ARM( 9,0, 7); _DEFPIN_ARM(10,0,18); _DEFPIN_ARM(11,0,16);
+_DEFPIN_ARM(12,0,19); _DEFPIN_ARM(13,0,17); _DEFPIN_ARM(14,0, 2); _DEFPIN_ARM(15,1, 8);
+_DEFPIN_ARM(16,1, 9); _DEFPIN_ARM(17,0, 4); _DEFPIN_ARM(18,0, 5); _DEFPIN_ARM(19,1, 2);
+_DEFPIN_ARM(20,0,22); _DEFPIN_ARM(21,0,23); _DEFPIN_ARM(22,0,12); _DEFPIN_ARM(23,1,11);
+_DEFPIN_ARM(24,1,10); _DEFPIN_ARM(25,1, 3); _DEFPIN_ARM(26,0,27); _DEFPIN_ARM(27,0,28);
+_DEFPIN_ARM(28,0,24); _DEFPIN_ARM(29,0,25); _DEFPIN_ARM(30,1,22); _DEFPIN_ARM(31,1,23);
+_DEFPIN_ARM(32,0,22); _DEFPIN_ARM(33,0,23); _DEFPIN_ARM(34,0,19); _DEFPIN_ARM(35,0,16);
+_DEFPIN_ARM(36,0,18); _DEFPIN_ARM(37,0,17); _DEFPIN_ARM(38,0,13); _DEFPIN_ARM(39,0,21);
+_DEFPIN_ARM(40,0, 6); _DEFPIN_ARM(41,0, 7); _DEFPIN_ARM(42,0, 3);
+
+// Arduino pin numbers used for SPI data and clock.
+#define SPI_DATA 24
+#define SPI_CLOCK 23
+
+#define HAS_HARDWARE_PIN_SUPPORT
+
+#endif
+
+
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+
+#endif // __INC_FASTPIN_ARM_SAM_H
@@ -0,0 +1,26 @@
+#ifndef __INC_LED_SYSDEFS_ARM_D21_H
+#define __INC_LED_SYSDEFS_ARM_D21_H
+
+
+// Platform identification flags.
+#define FASTLED_ARM
+#define FASTLED_ARM_M0_PLUS
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 1
+#endif
+
+// Default to NOT allowing interrupts (0) unless this was already defined
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 0
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+// Reusing/abusing the cli/sei names (as done for the Due), mapped onto
+// __disable_irq()/__enable_irq().
+// NOTE(review): both macros carry their own trailing semicolon, so `cli();` expands to
+// two statements; avoid using them as the sole body of an unbraced if/else.
+#define cli()  __disable_irq();
+#define sei() __enable_irq();
+
+
+#endif
@@ -0,0 +1,145 @@
+#ifndef __INC_CLOCKLESS_ARM_K20_H
+#define __INC_CLOCKLESS_ARM_K20_H
+
+FASTLED_NAMESPACE_BEGIN
+
+// Definition for a single channel clockless controller for the k20 family of chips, like that used in the teensy 3.0/3.1
+// See clockless.h for detailed info on how the template parameters are used.
+#if defined(FASTLED_TEENSY3)
+
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Single-pin clockless LED driver for the k20 (Teensy 3.x).  Bit timing is derived from
+// polling the cycle counter ARM_DWT_CYCCNT; T1, T2 and T3 are the three segment lengths
+// of one bit, expressed in cycle-counter ticks.  XTRA0 is added to the 8 bits per byte
+// when instantiating writeBits; FLIP is not referenced in this class.
+template <int DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<DATA_PIN>::port_t data_t;
+
+	data_t mPinMask;
+	data_ptr_t mPort;
+	CMinWait<WAIT_TIME> mWait;
+public:
+	// Configure the data pin as an output and cache its mask and port address.
+	virtual void init() {
+		FastPin<DATA_PIN>::setOutput();
+		mPinMask = FastPin<DATA_PIN>::mask();
+		mPort = FastPin<DATA_PIN>::port();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+	// Blank nLeds pixels by showing black at zero scale.
+	virtual void clearLeds(int nLeds) {
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+protected:
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+
+		mWait.wait();
+		showRGBInternal(pixels);
+		mWait.mark();
+	}
+
+	// Show an array of nLeds pixels.
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+
+		mWait.wait();
+		showRGBInternal(pixels);
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	// Show an array of nLeds ARGB pixels.
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+		mWait.wait();
+		showRGBInternal(pixels);
+		mWait.mark();
+	}
+#endif
+
+	// Sends BITS bits of b, most significant bit first.  The loop sends the first BITS-1
+	// bits, shifting b left after each; the final bit is sent after the loop without the
+	// trailing shift.
+	//
+	// For each bit: wait until next_mark, schedule the following bit T1+T2+T3 ticks
+	// later, write hi to the port, then write lo once the time left before next_mark has
+	// dropped to T3 (for a 1 bit) or T2+T3 (for a 0 bit), plus a small F_CPU-scaled
+	// allowance.
+	template<int BITS> __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register data_ptr_t port, register data_t hi, register data_t lo, register uint8_t & b)  {
+		for(register uint32_t i = BITS-1; i > 0; i--) {
+			while(ARM_DWT_CYCCNT < next_mark);
+			next_mark = ARM_DWT_CYCCNT + (T1+T2+T3);
+			FastPin<DATA_PIN>::fastset(port, hi);
+			if(b&0x80) {
+				while((next_mark - ARM_DWT_CYCCNT) > (T3+(2*(F_CPU/24000000))));
+				FastPin<DATA_PIN>::fastset(port, lo);
+			} else {
+				while((next_mark - ARM_DWT_CYCCNT) > (T2+T3+(2*(F_CPU/24000000))));
+				FastPin<DATA_PIN>::fastset(port, lo);
+			}
+			b <<= 1;
+		}
+
+		// last bit: same timing as above, but b is not shifted afterwards
+		while(ARM_DWT_CYCCNT < next_mark);
+		next_mark = ARM_DWT_CYCCNT + (T1+T2+T3);
+		FastPin<DATA_PIN>::fastset(port, hi);
+
+		if(b&0x80) {
+			while((next_mark - ARM_DWT_CYCCNT) > (T3+(2*(F_CPU/24000000))));
+			FastPin<DATA_PIN>::fastset(port, lo);
+		} else {
+			while((next_mark - ARM_DWT_CYCCNT) > (T2+T3+(2*(F_CPU/24000000))));
+			FastPin<DATA_PIN>::fastset(port, lo);
+		}
+	}
+
+	// Sends every pixel held by `pixels` and returns the cycle counter value at exit (the
+	// counter is reset to 0 on entry).  With FASTLED_ALLOW_INTERRUPTS == 1 interrupts are
+	// re-enabled briefly after each pixel, and the frame is abandoned if the cycle counter
+	// has run past next_mark by more than (WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US.
+	// Static, so it takes no `this` pointer.
+	static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+	    // Get access to the clock
+		ARM_DEMCR    |= ARM_DEMCR_TRCENA;
+		ARM_DWT_CTRL |= ARM_DWT_CTRL_CYCCNTENA;
+		ARM_DWT_CYCCNT = 0;
+
+		// Precompute the full port values with the data pin high and low, then start low.
+		register data_ptr_t port = FastPin<DATA_PIN>::port();
+		register data_t hi = *port | FastPin<DATA_PIN>::mask();;
+		register data_t lo = *port & ~FastPin<DATA_PIN>::mask();;
+		*port = lo;
+
+		// Setup the pixel controller and load/scale the first byte
+		pixels.preStepFirstByteDithering();
+		register uint8_t b = pixels.loadAndScale0();
+
+		cli();
+		uint32_t next_mark = ARM_DWT_CYCCNT + (T1+T2+T3);
+
+		while(pixels.has(1)) {
+			pixels.stepDithering();
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			cli();
+			// if interrupts ran past the allowed window, punt on the current frame
+			if(ARM_DWT_CYCCNT > next_mark) {
+				if((ARM_DWT_CYCCNT-next_mark) > ((WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US)) { sei(); return ARM_DWT_CYCCNT; }
+			}
+
+			// re-read the port, since it may have changed while interrupts were enabled
+			hi = *port | FastPin<DATA_PIN>::mask();
+			lo = *port & ~FastPin<DATA_PIN>::mask();
+			#endif
+			// Write first byte, read next byte
+			writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+			b = pixels.loadAndScale1();
+
+			// Write second byte, read 3rd byte
+			writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+			b = pixels.loadAndScale2();
+
+			// Write third byte, read 1st byte of next pixel
+			writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+			b = pixels.advanceAndLoadAndScale0();
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			sei();
+			#endif
+		};
+
+		sei();
+		return ARM_DWT_CYCCNT;
+	}
+};
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,396 @@
+#ifndef __INC_BLOCK_CLOCKLESS_ARM_K20_H
+#define __INC_BLOCK_CLOCKLESS_ARM_K20_H
+
+// Definition for the multi-lane (parallel output) clockless controllers for the k20 family of chips, like that used in the teensy 3.0/3.1
+// See clockless.h for detailed info on how the template parameters are used.
+#if defined(FASTLED_TEENSY3)
+#define FASTLED_HAS_BLOCKLESS 1
+
+// FIRST_PIN values that select the port C and port D lane sets respectively.
+#define PORTC_FIRST_PIN 15
+#define PORTD_FIRST_PIN 2
+#define HAS_PORTDC 1
+
+// NOTE: PORT_MASK and LANES expand in terms of FIRST_PIN and __LANES, so they are only
+// meaningful where those names are in scope (the controller template below).
+
+// Low LANES bits set, limited to 8 bits when FIRST_PIN is 2 and to 12 bits otherwise.
+#define PORT_MASK (((1<<LANES)-1) & ((FIRST_PIN==2) ? 0xFF : 0xFFF))
+
+// Classic two-argument minimum; each argument may be evaluated twice.
+#define MIN(X,Y) (((X)<(Y)) ? (X):(Y))
+// Requested lane count, clamped to 8 when FIRST_PIN is 2 and to 12 otherwise.
+#define LANES ((FIRST_PIN==2) ? MIN(__LANES,8) : MIN(__LANES,12))
+
+#include "kinetis.h"
+
+FASTLED_NAMESPACE_BEGIN
+
+/// Multi-lane clockless controller for the k20 (Teensy 3.x): drives up to LANES data
+/// lines in parallel through the port set/clear registers, timing each bit by polling
+/// ARM_DWT_CYCCNT.
+///
+/// @tparam __LANES   requested lane count (clamped by the LANES macro)
+/// @tparam FIRST_PIN PORTC_FIRST_PIN or PORTD_FIRST_PIN, selecting which pin set is used
+/// @tparam T1,T2,T3  the three segment lengths of one bit, in cycle-counter ticks
+/// @tparam RGB_ORDER byte order handed to MultiPixelController
+/// @tparam XTRA0     added to the BITS template argument of writeBits (which does not use it)
+/// @tparam FLIP      not referenced in this class
+/// @tparam WAIT_TIME minimum gap between frames, passed to CMinWait
+template <uint8_t __LANES, int FIRST_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = GRB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 40>
+class InlineBlockClocklessController : public CLEDController {
+	typedef typename FastPin<FIRST_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<FIRST_PIN>::port_t data_t;
+
+	data_t mPinMask;
+	data_ptr_t mPort;
+	CMinWait<WAIT_TIME> mWait;
+public:
+	/// Configure the pins for the selected lanes as outputs.  Each switch falls through
+	/// on purpose, so that starting at case LANES every lower-numbered lane is set up too.
+	virtual void init() {
+		if(FIRST_PIN == PORTC_FIRST_PIN) { // PORTC
+			switch(LANES) {
+				case 12: FastPin<30>::setOutput();
+				case 11: FastPin<29>::setOutput();
+				case 10: FastPin<27>::setOutput();
+				case 9: FastPin<28>::setOutput();
+				case 8: FastPin<12>::setOutput();
+				case 7: FastPin<11>::setOutput();
+				case 6: FastPin<13>::setOutput();
+				case 5: FastPin<10>::setOutput();
+				case 4: FastPin<9>::setOutput();
+				case 3: FastPin<23>::setOutput();
+				case 2: FastPin<22>::setOutput();
+				case 1: FastPin<15>::setOutput();
+			}
+		} else if(FIRST_PIN == PORTD_FIRST_PIN) { // PORTD
+			switch(LANES) {
+				case 8: FastPin<5>::setOutput();
+				case 7: FastPin<21>::setOutput();
+				case 6: FastPin<20>::setOutput();
+				case 5: FastPin<6>::setOutput();
+				case 4: FastPin<8>::setOutput();
+				case 3: FastPin<7>::setOutput();
+				case 2: FastPin<14>::setOutput();
+				case 1: FastPin<2>::setOutput();
+			}
+		}
+		mPinMask = FastPin<FIRST_PIN>::mask();
+		mPort = FastPin<FIRST_PIN>::port();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+	/// Blank nLeds pixels by showing black at zero scale.
+	virtual void clearLeds(int nLeds) {
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<LANES,PORT_MASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		uint32_t clocks = showRGBInternal(pixels,nLeds);
+		#if FASTLED_ALLOW_INTERRUPTS == 0
+		// Interrupts were masked for the whole frame: credit the elapsed time to the
+		// millisecond counter.
+		long microsTaken = CLKS_TO_MICROS(clocks);
+		MS_COUNTER += (1 + (microsTaken / 1000));
+		#else
+		(void)clocks;
+		#endif
+
+		mWait.mark();
+	}
+
+	/// Show nLeds pixels per lane from rgbdata.
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<LANES,PORT_MASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		uint32_t clocks = showRGBInternal(pixels,nLeds);
+		#if FASTLED_ALLOW_INTERRUPTS == 0
+		// Interrupts were masked for the whole frame: credit the elapsed time to the
+		// millisecond counter.
+		long microsTaken = CLKS_TO_MICROS(clocks);
+		MS_COUNTER += (1 + (microsTaken / 1000));
+		#else
+		(void)clocks;
+		#endif
+
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	/// Show nLeds ARGB pixels per lane from rgbdata.
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<LANES,PORT_MASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		uint32_t clocks = showRGBInternal(pixels,nLeds);
+
+		#if FASTLED_ALLOW_INTERRUPTS == 0
+		// Interrupts were masked for the whole frame: credit the elapsed time to the
+		// millisecond counter.
+		long microsTaken = CLKS_TO_MICROS(clocks);
+		MS_COUNTER += (1 + (microsTaken / 1000));
+		#else
+		(void)clocks;
+		#endif
+
+		mWait.mark();
+	}
+#endif
+
+
+	/// One byte per lane (up to 12), also viewable as 16- or 32-bit words.
+	typedef union {
+		uint8_t bytes[12];
+		uint16_t shorts[6];
+		uint32_t raw[3];
+	} Lines;
+
+	/// Sends the current byte of every lane (held in b) as 8 bit slots, and refills b
+	/// with each lane's next byte (channel PX) while doing so.
+	///
+	/// b is first transposed into b2 so that each entry of b2 carries one bit position
+	/// across all lanes.  The first loop sends LANES/2 bit slots and uses the idle time
+	/// after each one to load and scale two lanes' next bytes; the second loop sends the
+	/// remaining slots up to 8.
+	template<int BITS,int PX> __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register Lines & b, MultiPixelController<LANES, PORT_MASK, RGB_ORDER> &pixels) { // , register uint32_t & b2)  {
+		register Lines b2;
+		if(LANES>8) {
+			transpose8<1,2>(b.bytes,b2.bytes);
+			transpose8<1,2>(b.bytes+8,b2.bytes+1);
+		} else {
+			transpose8x1(b.bytes,b2.bytes);
+		}
+		register uint8_t d = pixels.template getd<PX>(pixels);
+		register uint8_t scale = pixels.template getscale<PX>(pixels);
+
+		for(register uint32_t i = 0; i < (LANES/2); i++) {
+			// start of bit: raise every lane
+			while(ARM_DWT_CYCCNT < next_mark);
+			next_mark = ARM_DWT_CYCCNT + (T1+T2+T3)-3;
+			*FastPin<FIRST_PIN>::sport() = PORT_MASK;
+
+			// drop the lanes whose bit is 0
+			while((next_mark - ARM_DWT_CYCCNT) > (T2+T3+(2*(F_CPU/24000000))));
+			if(LANES>8) {
+				*FastPin<FIRST_PIN>::cport() = ((~b2.shorts[i]) & PORT_MASK);
+			} else {
+				*FastPin<FIRST_PIN>::cport() = ((~b2.bytes[7-i]) & PORT_MASK);
+			}
+
+			// drop all remaining lanes
+			while((next_mark - ARM_DWT_CYCCNT) > (T3));
+			*FastPin<FIRST_PIN>::cport() = PORT_MASK;
+
+			b.bytes[i] = pixels.template loadAndScale<PX>(pixels,i,d,scale);
+			b.bytes[i+(LANES/2)] = pixels.template loadAndScale<PX>(pixels,i+(LANES/2),d,scale);
+		}
+
+		// if folks use an odd number of lanes, get the last byte's value here
+		if(LANES & 0x01) {
+			b.bytes[LANES-1] = pixels.template loadAndScale<PX>(pixels,LANES-1,d,scale);
+		}
+
+		for(register uint32_t i = LANES/2; i < 8; i++) {
+			while(ARM_DWT_CYCCNT < next_mark);
+			next_mark = ARM_DWT_CYCCNT + (T1+T2+T3)-3;
+			*FastPin<FIRST_PIN>::sport() = PORT_MASK;
+			while((next_mark - ARM_DWT_CYCCNT) > (T2+T3+(2*(F_CPU/24000000))));
+			if(LANES>8) {
+				*FastPin<FIRST_PIN>::cport() = ((~b2.shorts[i]) & PORT_MASK);
+			} else {
+				*FastPin<FIRST_PIN>::cport() = ((~b2.bytes[7-i]) & PORT_MASK);
+			}
+
+			while((next_mark - ARM_DWT_CYCCNT) > (T3));
+			*FastPin<FIRST_PIN>::cport() = PORT_MASK;
+
+		}
+	}
+
+
+
+	/// Sends nLeds pixels on every lane and returns the cycle counter value at exit (the
+	/// counter is reset to 0 on entry).  Interrupts are masked while data is being sent
+	/// and are always re-enabled before returning.  Static, so it takes no `this` pointer.
+	static uint32_t showRGBInternal(MultiPixelController<LANES, PORT_MASK, RGB_ORDER> &allpixels, int nLeds) {
+		// Get access to the clock
+		ARM_DEMCR    |= ARM_DEMCR_TRCENA;
+		ARM_DWT_CTRL |= ARM_DWT_CTRL_CYCCNTENA;
+		ARM_DWT_CYCCNT = 0;
+
+		// Setup the pixel controller and load/scale the first byte
+		allpixels.preStepFirstByteDithering();
+		register Lines b0;
+
+		// NOTE(review): preStepFirstByteDithering() is called a second time here, as in
+		// the original; kept unchanged because it cannot be confirmed from this file
+		// whether the repeat is intentional.
+		allpixels.preStepFirstByteDithering();
+		for(int i = 0; i < LANES; i++) {
+			b0.bytes[i] = allpixels.loadAndScale0(i);
+		}
+
+		cli();
+		uint32_t next_mark = ARM_DWT_CYCCNT + (T1+T2+T3);
+
+		while(nLeds--) {
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			cli();
+			// if interrupts ran past the allowed window, punt on the current frame
+			if(ARM_DWT_CYCCNT > next_mark) {
+				if((ARM_DWT_CYCCNT-next_mark) > ((WAIT_TIME-5)*CLKS_PER_US)) { sei(); return ARM_DWT_CYCCNT; }
+			}
+			#endif
+			allpixels.stepDithering();
+
+			// Write first byte, read next byte
+			writeBits<8+XTRA0,1>(next_mark, b0, allpixels);
+
+			// Write second byte, read 3rd byte
+			writeBits<8+XTRA0,2>(next_mark, b0, allpixels);
+			allpixels.advanceData();
+
+			// Write third byte
+			writeBits<8+XTRA0,0>(next_mark, b0, allpixels);
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			sei();
+			#endif
+		};
+
+		// Balances the cli() taken before the loop, so interrupts are restored even when
+		// FASTLED_ALLOW_INTERRUPTS is 0 (or nLeds is 0) and the per-pixel sei() never runs.
+		sei();
+		return ARM_DWT_CYCCNT;
+	}
+};
+
+#define DLANES (MIN(__LANES,16))
+#define PMASK ((1<<(DLANES))-1)
+#define PMASK_HI (PMASK>>8 & 0xFF)
+#define PMASK_LO (PMASK & 0xFF)
+/* Clockless controller that drives up to 16 lanes in parallel, split over two GPIO ports.
+ *
+ * writeBits sends the low eight lanes through FastPin<PORTD_FIRST_PIN> and the high eight lanes
+ * through FastPin<PORTC_FIRST_PIN>.
+ *
+ * __LANES    - requested lane count; DLANES caps it at 16.
+ * T1, T2, T3 - bit timing in units of ARM_DWT_CYCCNT ticks (see writeBits).
+ * RGB_ORDER  - channel order handed to MultiPixelController.
+ * XTRA0      - added to the BITS template argument of writeBits.
+ *              NOTE(review): BITS is not referenced in the writeBits body - confirm whether XTRA0 has any effect.
+ * FLIP       - not referenced anywhere in this class.
+ * WAIT_TIME  - template argument of the CMinWait used between frames.
+ */
+template <uint8_t __LANES, int T1, int T2, int T3, EOrder RGB_ORDER = GRB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class SixteenWayInlineBlockClocklessController : public CLEDController {
+	typedef typename FastPin<PORTC_FIRST_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<PORTC_FIRST_PIN>::port_t data_t;
+
+	data_t mPinMask; // not referenced by any method of this class
+	data_ptr_t mPort; // not referenced by any method of this class
+	CMinWait<WAIT_TIME> mWait;
+public:
+	virtual void init() { // switches the pins of the first DLANES lanes to output
+				// FastPin<30>::setOutput();
+				// FastPin<29>::setOutput();
+				// FastPin<27>::setOutput();
+				// FastPin<28>::setOutput();
+				switch(DLANES) { // every case falls through on purpose, so lane N also configures lanes N-1..1
+					case 16: FastPin<12>::setOutput();
+					case 15: FastPin<11>::setOutput();
+					case 14: FastPin<13>::setOutput();
+					case 13: FastPin<10>::setOutput();
+					case 12: FastPin<9>::setOutput();
+					case 11: FastPin<23>::setOutput();
+					case 10: FastPin<22>::setOutput();
+					case 9:  FastPin<15>::setOutput(); // lanes 9-16 are port C bits 0-7 in the teensy 3 _DEFPIN_ARM table
+
+					case 8:  FastPin<5>::setOutput();
+					case 7:  FastPin<21>::setOutput();
+					case 6:  FastPin<20>::setOutput();
+					case 5:  FastPin<6>::setOutput();
+					case 4:  FastPin<8>::setOutput();
+					case 3:  FastPin<7>::setOutput();
+					case 2:  FastPin<14>::setOutput();
+					case 1:  FastPin<2>::setOutput(); // lanes 1-8 are port D bits 0-7 in the teensy 3 _DEFPIN_ARM table
+				}
+	}
+	/* Blanks nLeds pixels per lane by showing black through showColor with a scale of 0. */
+	virtual void clearLeds(int nLeds) {
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<DLANES,PMASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		showRGBInternal(pixels,nLeds);
+		mWait.mark();
+	}
+	/* Sends nLeds pixels per lane from rgbdata, bracketed by mWait.wait()/mWait.mark(). */
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<DLANES,PMASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		showRGBInternal(pixels,nLeds);
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<DLANES,PMASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		showRGBInternal(pixels,nLeds);
+		mWait.mark();
+	}
+#endif
+
+	/* Sixteen bytes (one per lane) that can also be viewed as 16- or 32-bit words. */
+	typedef union {
+		uint8_t bytes[16];
+		uint16_t shorts[8];
+		uint32_t raw[4];
+	} Lines;
+	/* Sends one byte on every lane (eight bit slots) and refills b with the next byte of each lane.
+	 *
+	 * next_mark - cycle count at which the next bit slot starts; updated for every slot.
+	 * b         - in: the byte to send for each lane; out: the bytes loaded via loadAndScale<PX>.
+	 * pixels    - source for the dither (getd<PX>), scale (getscale<PX>) and next byte of each lane.
+	 *
+	 * Each slot: wait for next_mark, set all lanes, wait until T2+T3+6 cycles remain and clear the
+	 * lanes selected by the inverted transposed data, wait until T3 cycles remain and clear all lanes.
+	 */
+	template<int BITS,int PX> __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register Lines & b, MultiPixelController<DLANES, PMASK, RGB_ORDER> &pixels) { // , register uint32_t & b2)  {
+		register Lines b2;
+		transpose8x1(b.bytes,b2.bytes); // lanes 0-7 -> b2.bytes[0..7]
+		transpose8x1(b.bytes+8,b2.bytes+8); // lanes 8-15 -> b2.bytes[8..15]
+		register uint8_t d = pixels.template getd<PX>(pixels);
+		register uint8_t scale = pixels.template getscale<PX>(pixels);
+
+		for(register uint32_t i = 0; (i < DLANES) && (i < 8); i++) {
+			while(ARM_DWT_CYCCNT < next_mark); // wait for the start of this bit slot
+			next_mark = ARM_DWT_CYCCNT + (T1+T2+T3)-3;
+			*FastPin<PORTD_FIRST_PIN>::sport() = PMASK_LO; // set-register write: raise lanes 0-7
+			*FastPin<PORTC_FIRST_PIN>::sport() = PMASK_HI; // set-register write: raise lanes 8-15
+
+			while((next_mark - ARM_DWT_CYCCNT) > (T2+T3+6)); // hold until T2+T3+6 cycles remain
+			*FastPin<PORTD_FIRST_PIN>::cport() = ((~b2.bytes[7-i]) & PMASK_LO); // clear-register write: drop lanes whose bit is 0
+			*FastPin<PORTC_FIRST_PIN>::cport() = ((~b2.bytes[15-i]) & PMASK_HI);
+
+			while((next_mark - ARM_DWT_CYCCNT) > (T3)); // hold until T3 cycles remain
+			*FastPin<PORTD_FIRST_PIN>::cport() = PMASK_LO; // drop every lane
+			*FastPin<PORTC_FIRST_PIN>::cport() = PMASK_HI;
+
+			b.bytes[i] = pixels.template loadAndScale<PX>(pixels,i,d,scale); // refill lane i for the next byte
+			if(DLANES==16 || (DLANES>8 && ((i+8) < DLANES))) {
+				b.bytes[i+8] = pixels.template loadAndScale<PX>(pixels,i+8,d,scale); // and lane i+8 when it exists
+			}
+		}
+	}
+
+
+	/* Sends nLeds pixels per lane with interrupts disabled and returns ARM_DWT_CYCCNT as read after sei(). */
+	// This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
+	// gcc will use register Y for the this pointer.
+		static uint32_t showRGBInternal(MultiPixelController<DLANES, PMASK, RGB_ORDER> &allpixels, int nLeds) {
+		// Get access to the clock
+		ARM_DEMCR    |= ARM_DEMCR_TRCENA;
+		ARM_DWT_CTRL |= ARM_DWT_CTRL_CYCCNTENA;
+		ARM_DWT_CYCCNT = 0;
+
+		// Setup the pixel controller and load/scale the first byte
+		allpixels.preStepFirstByteDithering();
+		register Lines b0;
+
+		allpixels.preStepFirstByteDithering(); // NOTE(review): second call in a row - confirm this is intentional
+		for(int i = 0; i < DLANES; i++) {
+			b0.bytes[i] = allpixels.loadAndScale0(i);
+		}
+
+		cli();
+		uint32_t next_mark = ARM_DWT_CYCCNT + (T1+T2+T3);
+
+		while(nLeds--) {
+			allpixels.stepDithering();
+			#if 0 && (FASTLED_ALLOW_INTERRUPTS == 1)
+			cli();
+			// if interrupts took longer than 45µs, punt on the current frame
+			if(ARM_DWT_CYCCNT > next_mark) {
+				if((ARM_DWT_CYCCNT-next_mark) > ((WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US)) { sei(); return ARM_DWT_CYCCNT; }
+			}
+			#endif
+
+			// Write first byte, read next byte
+			writeBits<8+XTRA0,1>(next_mark, b0, allpixels);
+
+			// Write second byte, read 3rd byte
+			writeBits<8+XTRA0,2>(next_mark, b0, allpixels);
+			allpixels.advanceData();
+
+			// Write third byte
+			writeBits<8+XTRA0,0>(next_mark, b0, allpixels);
+
+			#if 0 && (FASTLED_ALLOW_INTERRUPTS == 1)
+			sei();
+			#endif
+		};
+		sei(); // the per-pixel interrupt windows above are compiled out (#if 0), so interrupts stay off for the whole frame
+
+		return ARM_DWT_CYCCNT;
+	}
+};
+
+FASTLED_NAMESPACE_END
+
+#endif
+
+#endif
@@ -0,0 +1,14 @@
+#ifndef __INC_FASTLED_ARM_K20_H
+#define __INC_FASTLED_ARM_K20_H
+
+// Include the k20 headers
+#include "bitswap.h"
+#include "fastled_delay.h"
+#include "fastpin_arm_k20.h"
+#include "fastspi_arm_k20.h"
+#include "octows2811_controller.h"
+#include "smartmatrix_t3.h"
+#include "clockless_arm_k20.h"
+#include "clockless_block_arm_k20.h"
+
+#endif
@@ -0,0 +1,120 @@
+#ifndef __FASTPIN_ARM_K20_H
+#define __FASTPIN_ARM_K20_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+#warning "Software pin support forced, pin access will be sloightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+
+/// Template definition for teensy 3.0 style ARM pins, providing direct access to the various GPIO registers.  Note that this
+/// uses the full port GPIO registers.  In theory, in some way, bit-band register access -should- be faster, however I have found
+/// that something about the way gcc does register allocation results in the bit-band code being slower.  It will need more fine tuning.
+/// The registers are data output (_PDOR), set output (_PSOR), clear output (_PCOR), toggle output (_PTOR), input (_PDIR), and direction (_PDDR).  PIN is the pin number handed to pinMode; _MASK is this pin's bit within the port registers.
+template<uint8_t PIN, uint32_t _MASK, typename _PDOR, typename _PSOR, typename _PCOR, typename _PTOR, typename _PDIR, typename _PDDR> class _ARMPIN {
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }
+	inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+
+	inline static void hi() __attribute__ ((always_inline)) { _PSOR::r() = _MASK; } // write this pin's mask to the set-output register
+	inline static void lo() __attribute__ ((always_inline)) { _PCOR::r() = _MASK; } // write this pin's mask to the clear-output register
+	inline static void set(register port_t val) __attribute__ ((always_inline)) { _PDOR::r() = val; } // overwrites the whole data-output register with val
+
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); } // two toggles in a row
+
+	inline static void toggle() __attribute__ ((always_inline)) { _PTOR::r() = _MASK; } // write this pin's mask to the toggle-output register
+
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); } // port argument is ignored
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); } // port argument is ignored
+	inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; } // raw store of val through the given register pointer
+
+	inline static port_t hival() __attribute__ ((always_inline)) { return _PDOR::r() | _MASK; } // current data-output value with this pin's bit set
+	inline static port_t loval() __attribute__ ((always_inline)) { return _PDOR::r() & ~_MASK; } // current data-output value with this pin's bit cleared
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PDOR::r(); } // address of the data-output register
+	inline static port_ptr_t sport() __attribute__ ((always_inline)) { return &_PSOR::r(); } // address of the set-output register
+	inline static port_ptr_t cport() __attribute__ ((always_inline)) { return &_PCOR::r(); } // address of the clear-output register
+	inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+/// Template definition for teensy 3.0 style ARM pins using bit banding, providing direct access to the various GPIO registers.  GCC
+/// does a poor job of optimizing around these accesses so they are not being used just yet.  Every access goes through rx<_BIT>(), the bit-band alias pointer for bit _BIT of the named register; unlike _ARMPIN this class defines no sport()/cport().
+template<uint8_t PIN, int _BIT, typename _PDOR, typename _PSOR, typename _PCOR, typename _PTOR, typename _PDIR, typename _PDDR> class _ARMPIN_BITBAND {
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }  (note: _MASK is not a parameter of this template)
+	inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }  (note: _MASK is not a parameter of this template)
+
+	inline static void hi() __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = 1; } // store 1 to the data-output alias word
+	inline static void lo() __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = 0; } // store 0 to the data-output alias word
+	inline static void set(register port_t val) __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = val; } // store val to the data-output alias word
+
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); } // two toggles in a row
+
+	inline static void toggle() __attribute__ ((always_inline)) { *_PTOR::template rx<_BIT>() = 1; } // store 1 to the toggle-output alias word
+
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi();  } // port argument is ignored
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); } // port argument is ignored
+	inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *_PDOR::template rx<_BIT>() = val; } // port argument is ignored; always stores to this pin's alias word
+
+	inline static port_t hival() __attribute__ ((always_inline)) { return 1; } // value to store for "high"
+	inline static port_t loval() __attribute__ ((always_inline)) { return 0; } // value to store for "low"
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return _PDOR::template rx<_BIT>(); } // alias pointer for this pin's data-output bit
+	inline static port_t mask() __attribute__ ((always_inline)) { return 1; }
+};
+
+// Macros for k20 pin access/definition
+#define GPIO_BITBAND_ADDR(reg, bit) (((uint32_t)&(reg) - 0x40000000) * 32 + (bit) * 4 + 0x42000000)
+#define GPIO_BITBAND_PTR(reg, bit) ((uint32_t *)GPIO_BITBAND_ADDR((reg), (bit)))
+
+#define _R(T) struct __gen_struct_ ## T
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline reg32_t r() { return T; } \
+	template<int BIT> static __attribute__((always_inline)) inline ptr_reg32_t rx() { return GPIO_BITBAND_PTR(T, BIT); } };
+#define _IO32(L) _RD32(GPIO ## L ## _PDOR); _RD32(GPIO ## L ## _PSOR); _RD32(GPIO ## L ## _PCOR); _RD32(GPIO ## L ## _PTOR); _RD32(GPIO ## L ## _PDIR); _RD32(GPIO ## L ## _PDDR);
+
+#define _DEFPIN_ARM(PIN, BIT, L) template<> class FastPin<PIN> : public _ARMPIN<PIN, 1 << BIT, _R(GPIO ## L ## _PDOR), _R(GPIO ## L ## _PSOR), _R(GPIO ## L ## _PCOR), \
+																			_R(GPIO ## L ## _PTOR), _R(GPIO ## L ## _PDIR), _R(GPIO ## L ## _PDDR)> {}; \
+									template<> class FastPinBB<PIN> : public _ARMPIN_BITBAND<PIN, BIT, _R(GPIO ## L ## _PDOR), _R(GPIO ## L ## _PSOR), _R(GPIO ## L ## _PCOR), \
+ 																			_R(GPIO ## L ## _PTOR), _R(GPIO ## L ## _PDIR), _R(GPIO ## L ## _PDDR)> {};
+
+// Actual pin definitions
+#if defined(FASTLED_TEENSY3) && defined(CORE_TEENSY)
+
+_IO32(A); _IO32(B); _IO32(C); _IO32(D); _IO32(E);
+
+#define MAX_PIN 33
+_DEFPIN_ARM(0, 16, B); _DEFPIN_ARM(1, 17, B); _DEFPIN_ARM(2, 0, D); _DEFPIN_ARM(3, 12, A);
+_DEFPIN_ARM(4, 13, A); _DEFPIN_ARM(5, 7, D); _DEFPIN_ARM(6, 4, D); _DEFPIN_ARM(7, 2, D);
+_DEFPIN_ARM(8, 3, D); _DEFPIN_ARM(9, 3, C); _DEFPIN_ARM(10, 4, C); _DEFPIN_ARM(11, 6, C);
+_DEFPIN_ARM(12, 7, C); _DEFPIN_ARM(13, 5, C); _DEFPIN_ARM(14, 1, D); _DEFPIN_ARM(15, 0, C);
+_DEFPIN_ARM(16, 0, B); _DEFPIN_ARM(17, 1, B); _DEFPIN_ARM(18, 3, B); _DEFPIN_ARM(19, 2, B);
+_DEFPIN_ARM(20, 5, D); _DEFPIN_ARM(21, 6, D); _DEFPIN_ARM(22, 1, C); _DEFPIN_ARM(23, 2, C);
+_DEFPIN_ARM(24, 5, A); _DEFPIN_ARM(25, 19, B); _DEFPIN_ARM(26, 1, E); _DEFPIN_ARM(27, 9, C);
+_DEFPIN_ARM(28, 8, C); _DEFPIN_ARM(29, 10, C); _DEFPIN_ARM(30, 11, C); _DEFPIN_ARM(31, 0, E);
+_DEFPIN_ARM(32, 18, B); _DEFPIN_ARM(33, 4, A);
+
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define SPI1            (*(SPI_t *)0x4002D000)
+
+#define SPI2_DATA 7
+#define SPI2_CLOCK 14
+
+#define FASTLED_TEENSY3
+#define ARM_HARDWARE_SPI
+#define HAS_HARDWARE_PIN_SUPPORT
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+#endif // __FASTPIN_ARM_K20_H
@@ -0,0 +1,446 @@
+#ifndef __INC_FASTSPI_ARM_H
+#define __INC_FASTSPI_ARM_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_TEENSY3) && defined(CORE_TEENSY)
+
+// Version 1.20 renamed SPI_t to KINETISK_SPI_t
+#if TEENSYDUINO >= 120
+#define SPI_t KINETISK_SPI_t
+#endif
+
+#ifndef KINETISK_SPI0
+#define KINETISK_SPI0 SPI0
+#endif
+
+#ifndef SPI_PUSHR_CONT
+#define SPI_PUSHR_CONT SPIX.PUSHR_CONT
+#define SPI_PUSHR_CTAS(X) SPIX.PUSHR_CTAS(X)
+#define SPI_PUSHR_EOQ SPIX.PUSHR_EOQ
+#define SPI_PUSHR_CTCNT SPIX.PUSHR_CTCNT
+#define SPI_PUSHR_PCS(X) SPIX.PUSHR_PCS(X)
+#endif
+
+// Compile-time helper: BitWork<VAL, BIT>::highestBit() yields the index of the highest bit set in VAL,
+// searching downwards from bit BIT.  A VAL with no bits set at or below BIT yields 0, which is the same
+// answer given when bit 0 is the only bit set - callers cannot tell those two cases apart.
+template<int VAL, int BIT> class BitWork {
+public:
+	static int highestBit() __attribute__((always_inline)) {
+		// Bit BIT is set: this is the answer.
+		if((VAL & (1 << BIT)) != 0) { return BIT; }
+		// Otherwise keep looking one bit lower.
+		return BitWork<VAL, BIT-1>::highestBit();
+	}
+};
+// Recursion terminator: once bit 0 is reached the result is 0 whether or not that bit is set.
+template<int VAL> class BitWork<VAL, 0> {
+public:
+	static int highestBit() __attribute__((always_inline)) {
+		return 0;
+	}
+};
+
+#define MAX(A, B) (( (A) > (B) ) ? (A) : (B))
+
+#define USE_CONT 0
+// Snapshot of SPI/pin settings taken before a transfer so they can be put back afterwards
+struct SPIState {
+	// saved CTAR0 / CTAR1 register values
+	uint32_t _ctar0;
+	uint32_t _ctar1;
+	// saved pin config registers, in the order: pin 7, pin 11, pin 13, pin 14
+	uint32_t pins[4];
+};
+
+// extern SPIState gState;
+
+
+/* Templated function to translate a clock divider value into the prescalar, scalar, and clock doubling setting for the world.
+ *
+ * VAL is the requested divider (compile-time constant).  All three outputs are written on every path:
+ *   preScalar - index 0..3; the default branch pairs these indices with the factors 2, 3, 5 and 7
+ *   scalar    - scaler index
+ *   dbl       - 1 to request clock doubling, otherwise 0
+ *
+ * A handful of dividers are mapped explicitly.  Every other value goes through the default branch,
+ * which builds one candidate per factor (factor * 2^p, p being the index of the highest set bit of
+ * VAL/factor), keeps the largest candidate (ties go to the smaller factor) and then adjusts the result.
+ */
+template <int VAL> void getScalars(uint32_t & preScalar, uint32_t & scalar, uint32_t & dbl) {
+	switch(VAL) {
+		// Handle the dbl clock cases
+		case 0: case 1:
+		case 2: preScalar = 0; scalar = 0; dbl = 1; break;
+		case 3: preScalar = 1; scalar = 0; dbl = 1; break;
+		case 5: preScalar = 2; scalar = 0; dbl = 1; break;
+		case 7: preScalar = 3; scalar = 0; dbl = 1; break;
+
+		// Handle the scalar value 6 cases (since it's not a power of two, it won't get caught
+		// below)
+		case 9: preScalar = 1; scalar = 2; dbl = 1; break;
+		case 18: case 19: preScalar = 1; scalar = 2; dbl = 0; break;
+
+		case 15: preScalar = 2; scalar = 2; dbl = 1; break;
+		case 30: case 31: preScalar = 2; scalar = 2; dbl = 0; break;
+
+		case 21: case 22: case 23: preScalar = 3; scalar = 2; dbl = 1; break;
+		case 42: case 43: case 44: case 45: case 46: case 47: preScalar = 3; scalar = 2; dbl = 0; break;
+		default: {
+			int p2 = BitWork<VAL/2, 15>::highestBit(); // exponent of the largest power of two not above VAL/2
+			int p3 = BitWork<VAL/3, 15>::highestBit();
+			int p5 = BitWork<VAL/5, 15>::highestBit();
+			int p7 = BitWork<VAL/7, 15>::highestBit();
+
+			int w2 = 2 * (1 << p2); // candidate divider for each factor; 0 marks a factor larger than VAL
+			int w3 = (VAL/3) > 0 ? 3 * (1 << p3) : 0;
+			int w5 = (VAL/5) > 0 ? 5 * (1 << p5) : 0;
+			int w7 = (VAL/7) > 0 ? 7 * (1 << p7) : 0;
+
+			int maxval = MAX(MAX(w2, w3), MAX(w5, w7));
+
+			if(w2 == maxval) { preScalar = 0; scalar = p2; } // one of these always matches, since maxval is one of the four
+			else if(w3 == maxval) { preScalar = 1; scalar = p3; }
+			else if(w5 == maxval) { preScalar = 2; scalar = p5; }
+			else if(w7 == maxval) { preScalar = 3; scalar = p7; }
+
+			dbl = 0;
+			if(scalar == 0) { dbl = 1; }
+			else if(scalar < 3) { scalar--; } // NOTE(review): presumably compensates for the hardware scaler table not being pure powers of two - confirm against the K20 reference manual
+		}
+	}
+	return;
+}
+
+#define SPIX (*(SPI_t*)pSPIX)
+/* Hardware SPI output for the K20 SPI module whose register block lives at address pSPIX (accessed through the SPIX macro).
+ *
+ * _DATA_PIN / _CLOCK_PIN - pin numbers; enable_pins()/disable_pins() only act on data pins 7/11 and clock pins 13/14.
+ * _SPI_CLOCK_DIVIDER     - divider handed to getScalars<> in setSPIRate().
+ * pSPIX                  - address of the SPI register block.
+ *
+ * select() snapshots CTAR0/CTAR1 and the four pin configs and then programs this output's rate and pins;
+ * release() puts the saved values back.
+ */
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER, uint32_t pSPIX>
+class ARMHardwareSPIOutput {
+	Selectable *m_pSelect; // optional chip-select object; may be NULL
+	SPIState gState; // values captured by save_spi_state()
+
+	// Borrowed from the teensy3 SPSR emulation code -- note, enabling pin 7 disables pin 11 (and vice versa),
+	// and likewise enabling pin 14 disables pin 13 (and vice versa)
+	inline void enable_pins(void) __attribute__((always_inline)) {
+		//serial_print("enable_pins\n");
+		switch(_DATA_PIN) {
+			case 7:
+				CORE_PIN7_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+				CORE_PIN11_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+				break;
+			case 11:
+				CORE_PIN11_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+				CORE_PIN7_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+				break;
+		}
+
+		switch(_CLOCK_PIN) {
+			case 13:
+				CORE_PIN13_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+				CORE_PIN14_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+				break;
+			case 14:
+				CORE_PIN14_CONFIG = PORT_PCR_DSE | PORT_PCR_MUX(2);
+				CORE_PIN13_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1);
+				break;
+		}
+	}
+
+	// Borrowed from the teensy3 SPSR emulation code.  We disable the pins that we're using, and restore the state on the pins that we aren't using
+	inline void disable_pins(void) __attribute__((always_inline)) {
+		switch(_DATA_PIN) {
+			case 7: CORE_PIN7_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1); CORE_PIN11_CONFIG = gState.pins[1]; break;
+			case 11: CORE_PIN11_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1); CORE_PIN7_CONFIG = gState.pins[0]; break;
+		}
+
+		switch(_CLOCK_PIN) {
+			case 13: CORE_PIN13_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1); CORE_PIN14_CONFIG = gState.pins[3]; break;
+			case 14: CORE_PIN14_CONFIG = PORT_PCR_SRE | PORT_PCR_DSE | PORT_PCR_MUX(1); CORE_PIN13_CONFIG = gState.pins[2]; break;
+		}
+	}
+	/* Writes both CTAR registers.  Does nothing when they already hold the requested values; if the
+	 * MDIS bit is clear in MCR, MDIS|HALT are set around the writes and MCR is put back afterwards. */
+	static inline void update_ctars(uint32_t ctar0, uint32_t ctar1) __attribute__((always_inline)) {
+		if(SPIX.CTAR0 == ctar0 && SPIX.CTAR1 == ctar1) return;
+		uint32_t mcr = SPIX.MCR;
+		if(mcr & SPI_MCR_MDIS) {
+			SPIX.CTAR0 = ctar0;
+			SPIX.CTAR1 = ctar1;
+		} else {
+			SPIX.MCR = mcr | SPI_MCR_MDIS | SPI_MCR_HALT;
+			SPIX.CTAR0 = ctar0;
+			SPIX.CTAR1 = ctar1;
+			SPIX.MCR = mcr;
+		}
+	}
+	/* Same as update_ctars, for CTAR0 only. */
+	static inline void update_ctar0(uint32_t ctar) __attribute__((always_inline)) {
+		if (SPIX.CTAR0 == ctar) return;
+		uint32_t mcr = SPIX.MCR;
+		if (mcr & SPI_MCR_MDIS) {
+			SPIX.CTAR0 = ctar;
+		} else {
+			SPIX.MCR = mcr | SPI_MCR_MDIS | SPI_MCR_HALT;
+			SPIX.CTAR0 = ctar;
+
+			SPIX.MCR = mcr;
+		}
+	}
+	/* Same as update_ctars, for CTAR1 only. */
+	static inline void update_ctar1(uint32_t ctar) __attribute__((always_inline)) {
+		if (SPIX.CTAR1 == ctar) return;
+		uint32_t mcr = SPIX.MCR;
+		if (mcr & SPI_MCR_MDIS) {
+			SPIX.CTAR1 = ctar;
+		} else {
+			SPIX.MCR = mcr | SPI_MCR_MDIS | SPI_MCR_HALT;
+			SPIX.CTAR1 = ctar;
+			SPIX.MCR = mcr;
+
+		}
+	}
+	/* Derives the CTAR0/CTAR1 values for _SPI_CLOCK_DIVIDER and writes them through update_ctars. */
+	void setSPIRate() {
+		// Configure CTAR0, defaulting to 8 bits and CTAR1, defaulting to 16 bits
+		uint32_t _PBR = 0;
+		uint32_t _BR = 0;
+		uint32_t _CSSCK = 0;
+		uint32_t _DBR = 0;
+
+		// if(_SPI_CLOCK_DIVIDER >= 256) 		{ _PBR = 0; _BR = _CSSCK = 7; _DBR = 0; } // osc/256
+		// else if(_SPI_CLOCK_DIVIDER >= 128) 	{ _PBR = 0; _BR = _CSSCK = 6; _DBR = 0; } // osc/128
+		// else if(_SPI_CLOCK_DIVIDER >= 64) 	{ _PBR = 0; _BR = _CSSCK = 5; _DBR = 0; } // osc/64
+		// else if(_SPI_CLOCK_DIVIDER >= 32) 	{ _PBR = 0; _BR = _CSSCK = 4; _DBR = 0; } // osc/32
+		// else if(_SPI_CLOCK_DIVIDER >= 16) 	{ _PBR = 0; _BR = _CSSCK = 3; _DBR = 0; } // osc/16
+		// else if(_SPI_CLOCK_DIVIDER >= 8) 	{ _PBR = 0; _BR = _CSSCK = 1; _DBR = 0; } // osc/8
+		// else if(_SPI_CLOCK_DIVIDER >= 7) 	{ _PBR = 3; _BR = _CSSCK = 0; _DBR = 1; } // osc/7
+		// else if(_SPI_CLOCK_DIVIDER >= 5) 	{ _PBR = 2; _BR = _CSSCK = 0; _DBR = 1; } // osc/5
+		// else if(_SPI_CLOCK_DIVIDER >= 4) 	{ _PBR = 0; _BR = _CSSCK = 0; _DBR = 0; } // osc/4
+		// else if(_SPI_CLOCK_DIVIDER >= 3) 	{ _PBR = 1; _BR = _CSSCK = 0; _DBR = 1; } // osc/3
+		// else                                { _PBR = 0; _BR = _CSSCK = 0; _DBR = 1; } // osc/2
+
+		getScalars<_SPI_CLOCK_DIVIDER>(_PBR, _BR, _DBR);
+		_CSSCK = _BR; // the CSSCK field reuses the baud-rate scaler index
+
+		uint32_t ctar0 = SPI_CTAR_FMSZ(7) | SPI_CTAR_PBR(_PBR) | SPI_CTAR_BR(_BR) | SPI_CTAR_CSSCK(_CSSCK);
+		uint32_t ctar1 = SPI_CTAR_FMSZ(15) | SPI_CTAR_PBR(_PBR) | SPI_CTAR_BR(_BR) | SPI_CTAR_CSSCK(_CSSCK);
+
+		#if USE_CONT == 1
+		ctar0 |= SPI_CTAR_CPHA | SPI_CTAR_CPOL;
+		ctar1 |= SPI_CTAR_CPHA | SPI_CTAR_CPOL;
+		#endif
+
+		if(_DBR) {
+			ctar0 |= SPI_CTAR_DBR;
+			ctar1 |= SPI_CTAR_DBR;
+		}
+
+		update_ctars(ctar0,ctar1);
+	}
+	/* Captures CTAR0/CTAR1 and the config registers of pins 7, 11, 13 and 14 into gState. */
+	void inline save_spi_state() __attribute__ ((always_inline)) {
+		// save ctar data
+		gState._ctar0 = SPIX.CTAR0;
+		gState._ctar1 = SPIX.CTAR1;
+
+		// save data for the not-us pins
+		gState.pins[0] = CORE_PIN7_CONFIG;
+		gState.pins[1] = CORE_PIN11_CONFIG;
+		gState.pins[2] = CORE_PIN13_CONFIG;
+		gState.pins[3] = CORE_PIN14_CONFIG;
+	}
+	/* Writes the CTAR values captured by save_spi_state() back to the module. */
+	void inline restore_spi_state() __attribute__ ((always_inline)) {
+		// restore ctar data
+		update_ctars(gState._ctar0,gState._ctar1);
+
+		// restore data for the not-us pins (not necessary because disable_pins will do this)
+		// CORE_PIN7_CONFIG = gState.pins[0];
+		// CORE_PIN11_CONFIG = gState.pins[1];
+		// CORE_PIN13_CONFIG = gState.pins[2];
+		// CORE_PIN14_CONFIG = gState.pins[3];
+	}
+
+
+public:
+	ARMHardwareSPIOutput() { m_pSelect = NULL; }
+	ARMHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/* Sets both pins to output, turns on the SIM_SCGC6 clock-gate bit for SPI0 or SPI1 if it is not already set, and enables the module as master. */
+	void init() {
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+
+		// Enable SPI0 clock
+		uint32_t sim6 = SIM_SCGC6;
+		if((SPI_t*)pSPIX == &KINETISK_SPI0) {
+			if (!(sim6 & SIM_SCGC6_SPI0)) {
+				//serial_print("init1\n");
+				SIM_SCGC6 = sim6 | SIM_SCGC6_SPI0;
+				SPIX.CTAR0 = SPI_CTAR_FMSZ(7) | SPI_CTAR_PBR(1) | SPI_CTAR_BR(1);
+			}
+		} else if((SPI_t*)pSPIX == &SPI1) {
+			if (!(sim6 & SIM_SCGC6_SPI1)) {
+				//serial_print("init1\n");
+				SIM_SCGC6 = sim6 | SIM_SCGC6_SPI1;
+				SPIX.CTAR0 = SPI_CTAR_FMSZ(7) | SPI_CTAR_PBR(1) | SPI_CTAR_BR(1);
+			}
+		}
+
+		// Configure SPI as the master and enable
+		SPIX.MCR |= SPI_MCR_MSTR; // | SPI_MCR_CONT_SCKE);
+		SPIX.MCR &= ~(SPI_MCR_MDIS | SPI_MCR_HALT);
+
+		// pin/spi configuration happens on select
+	}
+	/* Spins until the 0xF000 field of SR reads zero and TCF is set, then writes TCF|EOQF back to SR. */
+	static void waitFully() __attribute__((always_inline)) {
+		while( (SPIX.SR & 0xF000) > 0); // NOTE(review): 0xF000 is presumably the transmit FIFO fill count - confirm
+		while (!(SPIX.SR & SPI_SR_TCF));
+		SPIX.SR |= (SPI_SR_TCF | SPI_SR_EOQF);
+	}
+
+	static bool needwait() __attribute__((always_inline)) { return (SPIX.SR & 0x4000); } // true while wait() would still spin
+	static void wait() __attribute__((always_inline)) { while( (SPIX.SR & 0x4000) );  } // spins while SR bit 0x4000 is set
+	static void wait1() __attribute__((always_inline)) { while( (SPIX.SR & 0xF000) >= 0x2000);  } // spins while the 0xF000 field is at or above 0x2000
+
+	enum ECont { CONT, NOCONT };
+	enum EWait { PRE, POST, NONE };
+	enum ELast { NOTLAST, LAST };
+
+	#if USE_CONT == 1
+	#define CM CONT
+	#else
+	#define CM NOCONT
+	#endif
+	#define WM PRE
+	/* Writer whose behaviour is fixed by its template arguments: CONT_STATE adds SPI_PUSHR_CONT,
+	 * WAIT_STATE selects a wait() before (PRE) or after (POST) the push, LAST_STATE adds SPI_PUSHR_EOQ. */
+	template<ECont CONT_STATE, EWait WAIT_STATE, ELast LAST_STATE> class Write {
+	public:
+		static void writeWord(uint16_t w) __attribute__((always_inline)) {
+			if(WAIT_STATE == PRE) { wait(); }
+			SPIX.PUSHR = ((LAST_STATE == LAST) ? SPI_PUSHR_EOQ : 0) |
+			((CONT_STATE == CONT) ? SPI_PUSHR_CONT : 0) |
+			SPI_PUSHR_CTAS(1) | (w & 0xFFFF);
+			if(WAIT_STATE == POST) { wait(); }
+		}
+
+		static void writeByte(uint8_t b) __attribute__((always_inline)) {
+			if(WAIT_STATE == PRE) { wait(); }
+			SPIX.PUSHR = ((LAST_STATE == LAST) ? SPI_PUSHR_EOQ : 0) |
+			((CONT_STATE == CONT) ? SPI_PUSHR_CONT : 0) |
+			SPI_PUSHR_CTAS(0) | (b & 0xFF);
+			if(WAIT_STATE == POST) { wait(); }
+		}
+	};
+	/* Fixed-form writers.  Word variants push with CTAS(1) and byte variants with CTAS(0); the name suffix says whether wait() runs before (none), after (PostWait) or not at all (NoWait), and Cont variants add SPI_PUSHR_CONT. */
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { wait(); SPIX.PUSHR = SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+	static void writeWordNoWait(uint16_t w) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPIX.PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+
+	static void writeWordCont(uint16_t w) __attribute__((always_inline)) { wait(); SPIX.PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+	static void writeWordContNoWait(uint16_t w) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(1) | (w & 0xFFFF); }
+
+	static void writeByteCont(uint8_t b) __attribute__((always_inline)) { wait(); SPIX.PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+	static void writeByteContPostWait(uint8_t b) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); wait(); }
+	static void writeByteContNoWait(uint8_t b) __attribute__((always_inline)) { SPIX.PUSHR = SPI_PUSHR_CONT | SPI_PUSHR_CTAS(0) | (b & 0xFF); }
+
+	// not the most efficient mechanism in the world - but should be enough for sm16716 and friends
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		uint32_t ctar1_save = SPIX.CTAR1;
+
+		// Clear out the FMSZ bits, reset them for 1 bit transferred for the start bit
+		uint32_t ctar1 = (ctar1_save & (~SPI_CTAR_FMSZ(15))) | SPI_CTAR_FMSZ(0);
+		update_ctar1(ctar1);
+
+		writeWord( (b & (1 << BIT)) != 0); // pushes 1 if bit BIT of b is set, otherwise 0
+
+		update_ctar1(ctar1_save); // NOTE(review): no wait between the push above and this CTAR1 rewrite - confirm the frame is out by then
+	}
+	/* Saves the current SPI/pin state, asserts the optional select object, then programs rate and pins. */
+	void inline select() __attribute__((always_inline)) {
+		save_spi_state();
+		if(m_pSelect != NULL) { m_pSelect->select(); }
+		setSPIRate();
+		enable_pins();
+	}
+	/* Reverse of select(): pins first, then the optional select object, then the saved CTAR values. */
+	void inline release() __attribute__((always_inline)) {
+		disable_pins();
+		if(m_pSelect != NULL) { m_pSelect->release(); }
+		restore_spi_state();
+	}
+	/* Pushes value len times through Write<CM, WM, NOTLAST>; does not call select(), release() or waitFully(). */
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { Write<CM, WM, NOTLAST>::writeByte(value); }
+	}
+	/* Complete transfer of len copies of value: select, write, waitFully, release. */
+	void writeBytesValue(uint8_t value, int len) {
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		waitFully();
+		release();
+	}
+
+	// Write a block of n uint8_ts out
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		uint8_t *end = data + len;
+		select();
+		// could be optimized to write 16bit words out instead of 8bit bytes
+		while(data != end) {
+			writeByte(D::adjust(*data++)); // every byte goes through D::adjust before it is sent
+		}
+		D::postBlock(len);
+		waitFully();
+		release();
+	}
+	/* Overload that sends the bytes with DATA_NOP as the adjuster. */
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		select();
+		int len = pixels.mLen;
+
+		// Setup the pixel controller
+		if((FLAGS & FLAG_START_BIT) == 0) {
+			//If no start bit stupidity, write out as many 16-bit blocks as we can
+			while(pixels.has(2)) {
+				// Load and write out the first two bytes
+				if(WM == NONE) { wait1(); }
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(pixels.loadAndScale0()) << 8 | D::adjust(pixels.loadAndScale1()));
+
+				// Load and write out the next two bytes (step dithering, advance data in between since we
+				// cross pixels here)
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(pixels.loadAndScale2()) << 8 | D::adjust(pixels.stepAdvanceAndLoadAndScale0()));
+
+				// Load and write out the next two bytes
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(pixels.loadAndScale1()) << 8 | D::adjust(pixels.loadAndScale2()));
+				pixels.stepDithering();
+				pixels.advanceData();
+			}
+
+			if(pixels.has(1)) {
+				if(WM == NONE) { wait1(); }
+				// write out the rest as alternating 16/8-bit blocks (likely to be just one)
+				Write<CM, WM, NOTLAST>::writeWord(D::adjust(pixels.loadAndScale0()) << 8 | D::adjust(pixels.loadAndScale1()));
+				Write<CM, WM, NOTLAST>::writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			D::postBlock(len);
+			waitFully();
+		} else if(FLAGS & FLAG_START_BIT) {
+			uint32_t ctar1_save = SPIX.CTAR1;
+
+			// Clear out the FMSZ bits, reset them for 9 bits transferred for the start bit
+			uint32_t ctar1 = (ctar1_save & (~SPI_CTAR_FMSZ(15))) | SPI_CTAR_FMSZ(8);
+			update_ctar1(ctar1);
+
+			while(pixels.has(1)) {
+				writeWord( 0x100 | D::adjust(pixels.loadAndScale0())); // 0x100 puts a 1 in the ninth bit, ahead of the first channel byte
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+				pixels.advanceData();
+				pixels.stepDithering();
+			}
+			D::postBlock(len);
+			waitFully();
+
+			// restore ctar1
+			update_ctar1(ctar1_save);
+		}
+		release();
+	}
+};
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,46 @@
+#ifndef __INC_LED_SYSDEFS_ARM_K20_H
+#define __INC_LED_SYSDEFS_ARM_K20_H
+
+#define FASTLED_TEENSY3
+#define FASTLED_ARM
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 1
+#endif
+
+// Default to allowing interrupts
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 1
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+#if (F_CPU == 96000000)
+#define CLK_DBL 1
+#endif
+
+// Get some system include files
+#include <avr/io.h>
+#include <avr/interrupt.h> // for cli/sei definitions
+
+// Define the register types
+#if defined(ARDUINO) // && ARDUINO < 150
+typedef volatile       uint8_t RoReg; /**< Read only 8-bit register (declared here as plain volatile uint8_t, without const) */
+typedef volatile       uint8_t RwReg; /**< Read-Write 8-bit register (volatile uint8_t) */
+#endif
+
+extern volatile uint32_t systick_millis_count;
+#  define MS_COUNTER systick_millis_count
+
+
+// Default to using PROGMEM, since TEENSY3 provides it
+// even though all it does is ignore it.  Just being
+// conservative here in case TEENSY3 changes.
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+
+#endif
@@ -0,0 +1,96 @@
+#ifndef __INC_OCTOWS2811_CONTROLLER_H
+#define __INC_OCTOWS2811_CONTROLLER_H
+
+#ifdef USE_OCTOWS2811
+
+// #include "OctoWS2811.h"
+
+FASTLED_NAMESPACE_BEGIN
+
+/// FastLED controller that renders through an OctoWS2811 object (eight lanes in parallel).
+///
+/// Pixel bytes for the eight lanes are transposed with transpose8x1_MSB into the draw buffer
+/// handed to OctoWS2811, after which OctoWS2811::show() is called.  The OctoWS2811 object and its
+/// two buffers are created lazily by the first clearLeds/showColor/show call and sized from that
+/// call's nLeds (nLeds * 8 lanes * 3 bytes).
+///
+/// NOTE(review): the buffers keep the size of the first call; a later call with a larger nLeds
+/// would write past drawbuffer.  Callers are presumed to always pass the same count - confirm.
+///
+/// @tparam RGB_ORDER channel order applied by the pixel controller (OctoWS2811 is configured as WS2811_RGB)
+/// @tparam SLOW      when true, WS2811_400kHz is OR-ed into the OctoWS2811 config
+template<EOrder RGB_ORDER = GRB, boolean SLOW=false>
+class COctoWS2811Controller : public CLEDController {
+  OctoWS2811  *pocto;
+  uint8_t *drawbuffer,*framebuffer;
+
+  // Lazily allocates the two buffers and the OctoWS2811 instance.
+  // If either allocation fails nothing is kept and pocto stays NULL, so callers can bail out
+  // instead of writing through a NULL buffer, and a later call gets another attempt.
+  void _init(int nLeds) {
+    if(pocto != NULL) { return; }
+
+    drawbuffer = (uint8_t*)malloc(nLeds * 8 * 3);
+    framebuffer = (uint8_t*)malloc(nLeds * 8 * 3);
+    if(drawbuffer == NULL || framebuffer == NULL) {
+      // free(NULL) is a no-op, so this releases whichever buffer did get allocated
+      free(drawbuffer);
+      free(framebuffer);
+      drawbuffer = NULL;
+      framebuffer = NULL;
+      return;
+    }
+
+    // byte ordering is handled in show by the pixel controller
+    int config = WS2811_RGB;
+    if(SLOW) {
+      config |= WS2811_400kHz;
+    }
+
+    pocto = new OctoWS2811(nLeds, framebuffer, drawbuffer, config);
+
+    pocto->begin();
+  }
+public:
+  COctoWS2811Controller() { pocto = NULL; drawbuffer = NULL; framebuffer = NULL; }
+
+
+  virtual void init() { /* do nothing yet */ }
+
+  // Blank nLeds pixels on every lane (black colour, zero scale).
+  virtual void clearLeds(int nLeds) {
+    _init(nLeds);
+    showColor(CRGB(0,0,0),nLeds,CRGB(0,0,0));
+  }
+
+  // Show one colour on nLeds pixels of every lane.  Does nothing if the buffers could not be allocated.
+  virtual void showColor(const struct CRGB & data, int nLeds, CRGB scale) {
+    _init(nLeds);
+    if(pocto == NULL) { return; }
+
+    // Get our pixel values
+    PixelController<RGB_ORDER> pixels(data, nLeds, scale, getDither());
+    uint8_t ball[3][8];
+    memset(ball[0],pixels.loadAndScale0(),8);
+    memset(ball[1],pixels.loadAndScale1(),8);
+    memset(ball[2],pixels.loadAndScale2(),8);
+
+    // The same 24 transposed bytes describe every pixel, so build them once and replicate
+    uint8_t bout[24];
+    transpose8x1_MSB(ball[0],bout);
+    transpose8x1_MSB(ball[1],bout+8);
+    transpose8x1_MSB(ball[2],bout+16);
+
+    uint8_t *pdata = drawbuffer;
+    while(nLeds--) {
+      memcpy(pdata,bout,24);
+      pdata += 24;
+    }
+
+    pocto->show();
+  }
+
+  // Eight bytes (one per lane), also viewable as two 32-bit words
+  typedef union {
+    uint8_t bytes[8];
+    uint32_t raw[2];
+  } Lines;
+
+  // Show nLeds pixels per lane from rgbdata.  Does nothing if the buffers could not be allocated.
+  virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+    _init(nLeds);
+    if(pocto == NULL) { return; }
+
+    MultiPixelController<8,0xFF,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+
+    uint8_t *pData = drawbuffer;
+    while(nLeds--) {
+      Lines b;
+
+      // one transposed 8-byte group per colour channel, 24 bytes per pixel
+      for(int i = 0; i < 8; i++) { b.bytes[i] = pixels.loadAndScale0(i); }
+      transpose8x1_MSB(b.bytes,pData); pData += 8;
+      for(int i = 0; i < 8; i++) { b.bytes[i] = pixels.loadAndScale1(i); }
+      transpose8x1_MSB(b.bytes,pData); pData += 8;
+      for(int i = 0; i < 8; i++) { b.bytes[i] = pixels.loadAndScale2(i); }
+      transpose8x1_MSB(b.bytes,pData); pData += 8;
+      pixels.stepDithering();
+      pixels.advanceData();
+    }
+
+    pocto->show();
+  }
+};
+
+FASTLED_NAMESPACE_END
+
+#endif
+
+#endif
@@ -0,0 +1,83 @@
+#ifndef __INC_SMARTMATRIX_T3_H
+#define __INC_SMARTMATRIX_T3_H
+
+#ifdef SmartMatrix_h
+#include<SmartMatrix.h>
+
+FASTLED_NAMESPACE_BEGIN
+
+extern SmartMatrix *pSmartMatrix;
+
+// note - SmartMatrix.h must be included before FastLED for this code to be enabled (it is guarded by the SmartMatrix_h check above)
+class CSmartMatrixController : public CLEDController {
+  SmartMatrix matrix;
+
+public:
+  // Start the matrix at full brightness with no colour correction, blank it,
+  // and publish it through the global pSmartMatrix pointer.
+  virtual void init() {
+    matrix.begin();
+    matrix.setBrightness(255);
+    matrix.setColorCorrection(ccNone);
+
+    // begin from a cleared screen
+    clearLeds(0);
+    matrix.swapBuffers();
+    pSmartMatrix = &matrix;
+  }
+
+  // Fill the whole screen with black and swap buffers; nLeds is not used.
+  virtual void clearLeds(int nLeds) {
+    const rgb24 black = {0,0,0};
+    matrix.fillScreen(black);
+    matrix.swapBuffers();
+  }
+
+  // Write the same scaled colour into the first nLeds entries of the back buffer, then swap.
+  virtual void showColor(const struct CRGB & data, int nLeds,CRGB scale) {
+    PixelController<RGB> pixels(data, nLeds, scale, getDither());
+    for(rgb24 *px = matrix.backBuffer(); nLeds != 0; --nLeds, ++px) {
+      px->red = pixels.loadAndScale0();
+      px->green = pixels.loadAndScale1();
+      px->blue = pixels.loadAndScale2();
+      pixels.stepDithering();
+    }
+    matrix.swapBuffers();
+  }
+
+  // Scale nLeds pixels from data into the matrix back buffer, then swap.
+  // With SMART_MATRIX_CAN_TRIPLE_BUFFER the real back buffer is used as the target and
+  // the caller's data array is handed to the matrix as its back buffer afterwards.
+  virtual void show(const struct CRGB *data, int nLeds, CRGB scale) {
+    PixelController<RGB> pixels(data, nLeds, scale, getDither());
+#ifdef SMART_MATRIX_CAN_TRIPLE_BUFFER
+    rgb24 *px = matrix.getRealBackBuffer();
+#else
+    rgb24 *px = matrix.backBuffer();
+#endif
+    for(; nLeds != 0; --nLeds, ++px) {
+      px->red = pixels.loadAndScale0();
+      px->green = pixels.loadAndScale1();
+      px->blue = pixels.loadAndScale2();
+      pixels.advanceData();
+      pixels.stepDithering();
+    }
+    matrix.swapBuffers();
+#ifdef SMART_MATRIX_CAN_TRIPLE_BUFFER
+    matrix.setBackBuffer((rgb24*)data);
+#endif
+  }
+
+#ifdef SUPPORT_ARGB
+  // ARGB variant (every 4th byte is alpha and is skipped); left pure virtual here.
+  virtual void show(const struct CARGB *data, int nLeds, CRGB scale) = 0;
+#endif
+};
+
+FASTLED_NAMESPACE_END
+
+#endif
+
+#endif
@@ -0,0 +1,89 @@
+#ifndef __INC_CLOCKLESS_ARM_KL26
+#define __INC_CLOCKLESS_ARM_KL26
+
+#include "platforms/arm/common/m0clockless.h"
+FASTLED_NAMESPACE_BEGIN
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Clockless (single data line) LED controller for KL26; the bit output itself is done by showLedData().
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+  typedef typename FastPinBB<DATA_PIN>::port_ptr_t data_ptr_t;
+  typedef typename FastPinBB<DATA_PIN>::port_t data_t;
+
+  data_t mPinMask;
+  data_ptr_t mPort;
+  CMinWait<WAIT_TIME> mWait;
+
+  // Common output path: honour the minimum wait since the last frame, write the
+  // frame with interrupts disabled, then record when the frame finished.
+  void sendFrame(PixelController<RGB_ORDER> & pixels) {
+    mWait.wait();
+    cli();
+    showRGBInternal(pixels);
+    sei();
+    mWait.mark();
+  }
+
+public:
+  // Make the data pin an output and cache its mask and port pointer.
+  virtual void init() {
+    FastPinBB<DATA_PIN>::setOutput();
+    mPinMask = FastPinBB<DATA_PIN>::mask();
+    mPort = FastPinBB<DATA_PIN>::port();
+  }
+
+  virtual uint16_t getMaxRefreshRate() const {
+    return 400;
+  }
+
+  // Blank nLeds pixels by showing black with a zero scale.
+  virtual void clearLeds(int nLeds) {
+    showColor(CRGB(0, 0, 0), nLeds, 0);
+  }
+
+  // Show one colour on nLeds pixels.
+  virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    sendFrame(pixels);
+  }
+
+  // Show nLeds pixels taken from the rgbdata array.
+  virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    sendFrame(pixels);
+  }
+
+#ifdef SUPPORT_ARGB
+  // Show nLeds pixels taken from an ARGB array.
+  virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    sendFrame(pixels);
+  }
+#endif
+
+  // Copy the controller's dither (d, e), scale and advance values into an
+  // M0ClocklessData block and hand the pixel bytes to showLedData().
+  // Always returns 0; the showLedData() result is not propagated.
+  static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+    struct M0ClocklessData data;
+    for(int ch = 0; ch < 3; ++ch) {
+      data.d[ch] = pixels.d[ch];
+      data.s[ch] = pixels.mScale[ch];
+      data.e[ch] = pixels.e[ch];
+    }
+    data.adj = pixels.mAdvance;
+
+    typename FastPin<DATA_PIN>::port_ptr_t portBase = FastPin<DATA_PIN>::port();
+    showLedData<4,8,T1,T2,T3,RGB_ORDER, WAIT_TIME>(portBase, FastPin<DATA_PIN>::mask(), pixels.mData, pixels.mLen, &data);
+    return 0; // 0x00FFFFFF - _VAL;
+  }
+
+
+};
+
+FASTLED_NAMESPACE_END
+
+
+#endif // __INC_CLOCKLESS_ARM_KL26
@@ -0,0 +1,10 @@
+#ifndef __INC_FASTLED_ARM_KL26_H
+#define __INC_FASTLED_ARM_KL26_H
+
+// Include the kl26 headers
+#include "fastled_delay.h"
+#include "fastpin_arm_kl26.h"
+#include "fastspi_arm_kl26.h"
+#include "clockless_arm_kl26.h"
+
+#endif
@@ -0,0 +1,88 @@
+#ifndef __FASTPIN_ARM_KL26_H
+#define __FASTPIN_ARM_KL26_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+#warning "Software pin support forced, pin access will be sloightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+
+/// Template definition for teensy LC style ARM pins, providing direct access to the various GPIO registers.  Note that this
+/// uses the full port GPIO registers.  In theory, in some way, bit-band register access -should- be faster, however I have found
+/// that something about the way gcc does register allocation results in the bit-band code being slower.  It will need more fine tuning.
+/// The registers are data output, set output, clear output, toggle output, input, and direction
+// Static accessors for one pin. Each register is reached through the r() of the
+// matching template parameter; _MASK is the value written to, or combined with, it.
+template<uint8_t PIN, uint32_t _MASK, typename _PDOR, typename _PSOR, typename _PCOR, typename _PTOR, typename _PDIR, typename _PDDR> class _ARMPIN {
+public:
+  typedef volatile uint32_t * port_ptr_t;
+  typedef uint32_t port_t;
+
+  // Direction is currently set through pinMode() rather than by writing _PDDR directly (see the TODOs).
+  inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }
+  inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+
+  // hi()/lo() write _MASK to the set/clear registers; set() overwrites the whole data output register.
+  inline static void hi() __attribute__ ((always_inline)) { _PSOR::r() = _MASK; }
+  inline static void lo() __attribute__ ((always_inline)) { _PCOR::r() = _MASK; }
+  inline static void set(register port_t val) __attribute__ ((always_inline)) { _PDOR::r() = val; }
+
+  // strobe() is two consecutive toggles.
+  inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+  inline static void toggle() __attribute__ ((always_inline)) { _PTOR::r() = _MASK; }
+
+  // The port argument of hi(port)/lo(port) is ignored; fastset() writes val through the given pointer.
+  inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+  inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+  inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+  // hival()/loval(): current data output register value with this pin's mask set / cleared.
+  inline static port_t hival() __attribute__ ((always_inline)) { return _PDOR::r() | _MASK; }
+  inline static port_t loval() __attribute__ ((always_inline)) { return _PDOR::r() & ~_MASK; }
+  // Addresses of the data output, set and clear registers, and the pin mask itself.
+  inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PDOR::r(); }
+  inline static port_ptr_t sport() __attribute__ ((always_inline)) { return &_PSOR::r(); }
+  inline static port_ptr_t cport() __attribute__ ((always_inline)) { return &_PCOR::r(); }
+  inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+// Macros for kl26 pin access/definition
+// Bit-band alias address for bit `bit` of register `reg`: each bit of the region starting at
+// 0x40000000 maps to one 32-bit word in the region starting at 0x42000000.
+#define GPIO_BITBAND_ADDR(reg, bit) (((uint32_t)&(reg) - 0x40000000) * 32 + (bit) * 4 + 0x42000000)
+#define GPIO_BITBAND_PTR(reg, bit) ((uint32_t *)GPIO_BITBAND_ADDR((reg), (bit)))
+
+// _R(T) names the wrapper struct for register T. _RD32(T) defines that struct with r(),
+// which returns T, and rx<BIT>(), which returns the bit-band pointer for one bit of T.
+#define _R(T) struct __gen_struct_ ## T
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline reg32_t r() { return T; } \
+template<int BIT> static __attribute__((always_inline)) inline ptr_reg32_t rx() { return GPIO_BITBAND_PTR(T, BIT); } };
+// _IO32(L) defines the wrappers for the six registers of port L.
+// NOTE(review): PTOR uses the GPIO name while the other five use FGPIO - confirm this is intentional.
+#define _IO32(L) _RD32(FGPIO ## L ## _PDOR); _RD32(FGPIO ## L ## _PSOR); _RD32(FGPIO ## L ## _PCOR); _RD32(GPIO ## L ## _PTOR); _RD32(FGPIO ## L ## _PDIR); _RD32(FGPIO ## L ## _PDDR);
+
+// _DEFPIN_ARM(PIN, BIT, L) specialises FastPin<PIN> for bit BIT of port L. The FastPinBB
+// specialisation that follows is inside a comment, so it is not generated.
+#define _DEFPIN_ARM(PIN, BIT, L) template<> class FastPin<PIN> : public _ARMPIN<PIN, 1 << BIT, _R(FGPIO ## L ## _PDOR), _R(FGPIO ## L ## _PSOR), _R(FGPIO ## L ## _PCOR), \
+_R(GPIO ## L ## _PTOR), _R(FGPIO ## L ## _PDIR), _R(FGPIO ## L ## _PDDR)> {}; \
+/* template<> class FastPinBB<PIN> : public _ARMPIN_BITBAND<PIN, BIT, _R(GPIO ## L ## _PDOR), _R(GPIO ## L ## _PSOR), _R(GPIO ## L ## _PCOR), \
+_R(GPIO ## L ## _PTOR), _R(GPIO ## L ## _PDIR), _R(GPIO ## L ## _PDDR)> {}; */
+
+// Actual pin definitions
+#if defined(FASTLED_TEENSYLC) && defined(CORE_TEENSY)
+
+// Register wrappers for ports A through E.
+_IO32(A); _IO32(B); _IO32(C); _IO32(D); _IO32(E);
+
+// Pin table: each entry is (pin number, bit within the port, port letter). Pins 0 through 26 are defined.
+#define MAX_PIN 26
+_DEFPIN_ARM(0, 16, B); _DEFPIN_ARM(1, 17, B); _DEFPIN_ARM(2, 0, D); _DEFPIN_ARM(3, 1, A);
+_DEFPIN_ARM(4, 2, A); _DEFPIN_ARM(5, 7, D); _DEFPIN_ARM(6, 4, D); _DEFPIN_ARM(7, 2, D);
+_DEFPIN_ARM(8, 3, D); _DEFPIN_ARM(9, 3, C); _DEFPIN_ARM(10, 4, C); _DEFPIN_ARM(11, 6, C);
+_DEFPIN_ARM(12, 7, C); _DEFPIN_ARM(13, 5, C); _DEFPIN_ARM(14, 1, D); _DEFPIN_ARM(15, 0, C);
+_DEFPIN_ARM(16, 0, B); _DEFPIN_ARM(17, 1, B); _DEFPIN_ARM(18, 3, B); _DEFPIN_ARM(19, 2, B);
+_DEFPIN_ARM(20, 5, D); _DEFPIN_ARM(21, 6, D); _DEFPIN_ARM(22, 1, C); _DEFPIN_ARM(23, 2, C);
+_DEFPIN_ARM(24, 20, E); _DEFPIN_ARM(25, 21, E); _DEFPIN_ARM(26, 30, E);
+
+// Pin numbers used for the first SPI data/clock pair.
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+// #define SPI1            (*(SPI_t *)0x4002D000)
+
+// Pin numbers used for the second SPI data/clock pair.
+#define SPI2_DATA 0
+#define SPI2_CLOCK 20
+
+#define HAS_HARDWARE_PIN_SUPPORT
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+#endif // __FASTPIN_ARM_KL26_H
@@ -0,0 +1,252 @@
+#ifndef __INC_FASTSPI_ARM_KL26_H
+#define __INC_FASTSPI_ARM_KL26_H
+
+FASTLED_NAMESPACE_BEGIN
+
+/// Choose the two SPI baud-rate field values (sppr, spr) for a requested clock divider VAL.
+///
+/// The table is ordered from the largest divisor down. Every threshold equals
+/// (sppr + 1) * 2^(spr + 1) for the pair it selects, so the result is the largest
+/// table divisor strictly below VAL, with (0, 0) as the floor.
+/// Several divisors can be produced by more than one (sppr, spr) pair; only the first
+/// pair for each threshold can ever be reached, so only that one is listed.
+///
+/// NOTE(review): the comparisons are strict, so a VAL exactly equal to a table divisor
+/// gets the next smaller one (e.g. VAL == 8 selects divisor 6) - confirm whether >= was intended.
+///
+/// @param sppr receives the prescaler field value (0..7)
+/// @param spr  receives the rate divisor field value (0..8)
+template <int VAL> void getScalars(uint8_t & sppr, uint8_t & spr) {
+  if(VAL > 4096) { sppr=7; spr=8; }
+  else if(VAL > 3584) { sppr=6; spr=8; }
+  else if(VAL > 3072) { sppr=5; spr=8; }
+  else if(VAL > 2560) { sppr=4; spr=8; }
+  else if(VAL > 2048) { sppr=7; spr=7; }
+  else if(VAL > 1792) { sppr=6; spr=7; }
+  else if(VAL > 1536) { sppr=5; spr=7; }
+  else if(VAL > 1280) { sppr=4; spr=7; }
+  else if(VAL > 1024) { sppr=7; spr=6; }
+  else if(VAL > 896) { sppr=6; spr=6; }
+  else if(VAL > 768) { sppr=5; spr=6; }
+  else if(VAL > 640) { sppr=4; spr=6; }
+  else if(VAL > 512) { sppr=7; spr=5; }
+  else if(VAL > 448) { sppr=6; spr=5; }
+  else if(VAL > 384) { sppr=5; spr=5; }
+  else if(VAL > 320) { sppr=4; spr=5; }
+  else if(VAL > 256) { sppr=7; spr=4; }
+  else if(VAL > 224) { sppr=6; spr=4; }
+  else if(VAL > 192) { sppr=5; spr=4; }
+  else if(VAL > 160) { sppr=4; spr=4; }
+  else if(VAL > 128) { sppr=7; spr=3; }
+  else if(VAL > 112) { sppr=6; spr=3; }
+  else if(VAL > 96) { sppr=5; spr=3; }
+  else if(VAL > 80) { sppr=4; spr=3; }
+  else if(VAL > 64) { sppr=7; spr=2; }
+  else if(VAL > 56) { sppr=6; spr=2; }
+  else if(VAL > 48) { sppr=5; spr=2; }
+  else if(VAL > 40) { sppr=4; spr=2; }
+  else if(VAL > 32) { sppr=7; spr=1; }
+  else if(VAL > 28) { sppr=6; spr=1; }
+  else if(VAL > 24) { sppr=5; spr=1; }
+  else if(VAL > 20) { sppr=4; spr=1; }
+  else if(VAL > 16) { sppr=7; spr=0; }
+  else if(VAL > 14) { sppr=6; spr=0; }
+  else if(VAL > 12) { sppr=5; spr=0; }
+  else if(VAL > 10) { sppr=4; spr=0; }
+  else if(VAL > 8) { sppr=3; spr=0; }
+  else if(VAL > 6) { sppr=2; spr=0; }
+  else if(VAL > 4) { sppr=1; spr=0; }
+  else /* if(VAL > 2) */ { sppr=0; spr=0; }
+}
+
+
+// SPIX: the register block at address pSPIX (a template parameter of the class below), viewed as a KINETISL_SPI_t.
+#define SPIX (*(KINETISL_SPI_t*)pSPIX)
+// Presumably tested elsewhere to detect that a hardware SPI implementation is available - verify against the includer.
+#define ARM_HARDWARE_SPI
+
+/// Hardware SPI output on the SPI register block located at address pSPIX.
+/// _SPI_CLOCK_DIVIDER is translated to baud-rate register fields by getScalars().
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER, uint32_t pSPIX>
+class ARMHardwareSPIOutput {
+  Selectable *m_pSelect;
+
+  // Switch the data and clock pins to their SPI mux setting. Pins not listed are left untouched.
+  static inline void enable_pins(void) __attribute__((always_inline)) {
+    switch(_DATA_PIN) {
+      case 0: CORE_PIN0_CONFIG =  PORT_PCR_MUX(2); break;
+      case 1: CORE_PIN1_CONFIG =  PORT_PCR_MUX(5); break;
+      case 7: CORE_PIN7_CONFIG =  PORT_PCR_MUX(2); break;
+      case 8: CORE_PIN8_CONFIG =  PORT_PCR_MUX(5); break;
+      case 11: CORE_PIN11_CONFIG =  PORT_PCR_MUX(2); break;
+      case 12: CORE_PIN12_CONFIG =  PORT_PCR_MUX(5); break;
+      case 21: CORE_PIN21_CONFIG =  PORT_PCR_MUX(2); break;
+    }
+
+    switch(_CLOCK_PIN) {
+      case 13: CORE_PIN13_CONFIG =  PORT_PCR_MUX(2); break;
+      case 14: CORE_PIN14_CONFIG =  PORT_PCR_MUX(2); break;
+      case 20: CORE_PIN20_CONFIG =  PORT_PCR_MUX(2); break;
+    }
+  }
+
+  // Switch the data and clock pins back to mux setting 1 with PORT_PCR_SRE set.
+  static inline void disable_pins(void) __attribute((always_inline)) {
+    switch(_DATA_PIN) {
+      case 0: CORE_PIN0_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 1: CORE_PIN1_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 7: CORE_PIN7_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 8: CORE_PIN8_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 11: CORE_PIN11_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 12: CORE_PIN12_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 21: CORE_PIN21_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+    }
+
+    switch(_CLOCK_PIN) {
+      case 13: CORE_PIN13_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 14: CORE_PIN14_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+      case 20: CORE_PIN20_CONFIG = PORT_PCR_SRE | PORT_PCR_MUX(1); break;
+    }
+  }
+
+  // Program the baud-rate register for _SPI_CLOCK_DIVIDER, clear C2 and make sure SPE is set in C1.
+  void setSPIRate() {
+    uint8_t sppr, spr;
+    getScalars<_SPI_CLOCK_DIVIDER>(sppr, spr);
+
+    // Set the speed
+    SPIX.BR = SPI_BR_SPPR(sppr) | SPI_BR_SPR(spr);
+
+    // Also, force 8 bit transfers (don't want to juggle 8/16 since that flushes the world)
+    SPIX.C2 = 0;
+    SPIX.C1 |= SPI_C1_SPE;
+  }
+
+public:
+  ARMHardwareSPIOutput() { m_pSelect = NULL; }
+  ARMHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+
+  // set the object representing the selectable
+  void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+  // initialize the SPI subsystem: pins to output, module clock gate on, master mode enabled
+  void init() {
+    FastPin<_DATA_PIN>::setOutput();
+    FastPin<_CLOCK_PIN>::setOutput();
+
+    // Enable the SPI clocks (only the gate matching pSPIX, and only if it is not already on)
+    uint32_t sim4 = SIM_SCGC4;
+    if ((pSPIX == 0x40076000) && !(sim4 & SIM_SCGC4_SPI0)) {
+      SIM_SCGC4 = sim4 | SIM_SCGC4_SPI0;
+    }
+
+    if ( (pSPIX == 0x40077000) && !(sim4 & SIM_SCGC4_SPI1)) {
+      SIM_SCGC4 = sim4 | SIM_SCGC4_SPI1;
+    }
+
+    SPIX.C1 = SPI_C1_MSTR | SPI_C1_SPE;
+    SPIX.C2 = 0;
+    SPIX.BR = SPI_BR_SPPR(1) | SPI_BR_SPR(0);
+  }
+
+  // latch the CS select, then apply the SPI rate and route the pins to the SPI block
+  void inline select() __attribute__((always_inline)) {
+    if(m_pSelect != NULL) { m_pSelect->select(); }
+    setSPIRate();
+    enable_pins();
+  }
+
+
+  // hand the pins back to GPIO, then release the CS select
+  void inline release() __attribute__((always_inline)) {
+    disable_pins();
+    if(m_pSelect != NULL) { m_pSelect->release(); }
+  }
+
+  // Spin until the SPTEF flag is set in the status register
+  static void wait() __attribute__((always_inline)) { while(!(SPIX.S & SPI_S_SPTEF));  }
+
+  // wait until all queued up data has been written (currently the same check as wait())
+  void waitFully() { wait(); }
+
+  // not the most efficient mechanism in the world - but should be enough for sm16716 and friends
+  template <uint8_t BIT> inline static void writeBit(uint8_t b) { /* TODO */ }
+
+  // write a byte out via SPI (returns immediately on writing register)
+  static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPIX.DL = b; }
+  // write a word out via SPI, high byte first (returns immediately on writing register)
+  static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w & 0xFF); }
+
+  // A raw set of writing byte values, assumes setup/init/waiting done elsewhere (static for use by adjustment classes)
+  static void writeBytesValueRaw(uint8_t value, int len) {
+    while(len--) { writeByte(value); }
+  }
+
+  // A full cycle of writing a value for len bytes, including select, release, and waiting
+  void writeBytesValue(uint8_t value, int len) {
+    setSPIRate();
+    select();
+    while(len--) {
+      writeByte(value);
+    }
+    waitFully();
+    release();
+  }
+
+  // A full cycle of writing a raw block of data out, including select, release, and waiting.
+  // Each byte goes through D::adjust() and D::postBlock(len) runs after the last byte.
+  template <class D> void writeBytes(register uint8_t *data, int len) {
+    setSPIRate();
+    uint8_t *end = data + len;
+    select();
+    // could be optimized to write 16bit words out instead of 8bit bytes
+    while(data != end) {
+      writeByte(D::adjust(*data++));
+    }
+    D::postBlock(len);
+    waitFully();
+    release();
+  }
+
+  void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+
+  // A full cycle of writing scaled pixel data out, three bytes per pixel, including
+  // select, release, and waiting. With FLAG_START_BIT set, writeBit<0>(1) (currently a
+  // stub) is called before each pixel.
+  template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+    int len = pixels.mLen;
+
+    select();
+    while(pixels.has(1)) {
+      if(FLAGS & FLAG_START_BIT) {
+        writeBit<0>(1);
+        writeByte(D::adjust(pixels.loadAndScale0()));
+        writeByte(D::adjust(pixels.loadAndScale1()));
+        writeByte(D::adjust(pixels.loadAndScale2()));
+      } else {
+        writeByte(D::adjust(pixels.loadAndScale0()));
+        writeByte(D::adjust(pixels.loadAndScale1()));
+        writeByte(D::adjust(pixels.loadAndScale2()));
+      }
+
+      pixels.advanceData();
+      pixels.stepDithering();
+    }
+    D::postBlock(len);
+    // Wait for queued data before release() switches the pins back to GPIO,
+    // as writeBytes() and writeBytesValue() already do.
+    waitFully();
+    release();
+  }
+
+};
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,45 @@
+#ifndef __INC_LED_SYSDEFS_ARM_KL26_H
+#define __INC_LED_SYSDEFS_ARM_KL26_H
+
+#define FASTLED_TEENSYLC
+#define FASTLED_ARM
+#define FASTLED_ARM_M0_PLUS
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 1
+#endif
+
+// Default to allowing interrupts
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 1
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+#if (F_CPU == 96000000)
+#define CLK_DBL 1
+#endif
+
+// Get some system include files
+#include <avr/io.h>
+#include <avr/interrupt.h> // for cli/sei definitions
+
+// Define the register types
+#if defined(ARDUINO) // && ARDUINO < 150
+typedef volatile       uint8_t RoReg; /**< 8-bit register type for read-only registers (volatile uint8_t, not const-qualified) */
+typedef volatile       uint8_t RwReg; /**< 8-bit register type for read-write registers (volatile uint8_t) */
+#endif
+
+extern volatile uint32_t systick_millis_count;
+#  define MS_COUNTER systick_millis_count
+
+// Default to using PROGMEM since TEENSYLC provides it
+// even though all it does is ignore it.  Just being
+// conservative here in case TEENSYLC changes.
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+#endif
@@ -0,0 +1,117 @@
+#ifndef __INC_CLOCKLESS_ARM_NRF51
+#define __INC_CLOCKLESS_ARM_NRF51
+
+#if defined(NRF51)
+
+#include "nrf51_bitfields.h"
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Helper macros for opening an interrupt window during output.
+//  SEI_CHK   - arm LED_TIMER compare 0 for WAIT_TIME microseconds worth of CPU cycles,
+//              restart the count from zero and clear any stale compare event.
+//  CLI_CHK   - disable interrupts; if the compare event fired in the meantime, stop the
+//              timer and return 0 from the enclosing function.
+//  INNER_SEI - re-enable interrupts (or burn one cycle when interrupts are not allowed).
+// SEI_CHK expects a WAIT_TIME symbol to be in scope where it is expanded.
+#if (FASTLED_ALLOW_INTERRUPTS==1)
+// A task register only acts when 1 is written to it; a bare read of TASKS_CLEAR does nothing.
+#define SEI_CHK LED_TIMER->CC[0] = (WAIT_TIME * (F_CPU/1000000)); LED_TIMER->TASKS_CLEAR = 1; LED_TIMER->EVENTS_COMPARE[0] = 0;
+#define CLI_CHK cli(); if(LED_TIMER->EVENTS_COMPARE[0]) { LED_TIMER->TASKS_STOP = 1; return 0; }
+#define INNER_SEI sei();
+#else
+#define SEI_CHK
+#define CLI_CHK
+#define INNER_SEI delaycycles<1>();
+#endif
+
+
+#include "platforms/arm/common/m0clockless.h"
+// Clockless (single data line) LED controller for NRF51. The bit output is done by
+// showLedData(); LED_TIMER is started around each frame.
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 75>
+class ClocklessController : public CLEDController {
+  typedef typename FastPinBB<DATA_PIN>::port_ptr_t data_ptr_t;
+  typedef typename FastPinBB<DATA_PIN>::port_t data_t;
+
+  data_t mPinMask;
+  data_ptr_t mPort;
+  CMinWait<WAIT_TIME> mWait;
+public:
+  // Make the data pin an output and cache its mask and port pointer.
+  virtual void init() {
+    FastPinBB<DATA_PIN>::setOutput();
+    mPinMask = FastPinBB<DATA_PIN>::mask();
+    mPort = FastPinBB<DATA_PIN>::port();
+  }
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+  // Blank nLeds pixels by showing black with a zero scale.
+  virtual void clearLeds(int nLeds) {
+    showColor(CRGB(0, 0, 0), nLeds, 0);
+  }
+
+  // set all the leds on the controller to a given color
+  virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    mWait.wait();
+    cli();
+
+    // attempt to re-show a frame if we exit early because of interrupts.
+    // A zero return is treated as "interrupted": interrupts are re-enabled for
+    // WAIT_TIME microseconds and the frame is sent once more.
+    if(!showRGBInternal(pixels)) {
+      sei(); delayMicroseconds(WAIT_TIME); cli();
+      showRGBInternal(pixels);
+    }
+
+    sei();
+    mWait.mark();
+  }
+
+  // Show nLeds pixels from rgbdata, with the same single retry as showColor().
+  virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    mWait.wait();
+    cli();
+
+    if(!showRGBInternal(pixels)) {
+      sei(); delayMicroseconds(WAIT_TIME); cli();
+      showRGBInternal(pixels);
+    }
+
+    sei();
+    mWait.mark();
+  }
+
+#ifdef SUPPORT_ARGB
+  // ARGB variant; note that this path does not retry when the first attempt returns 0.
+  virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    mWait.wait();
+    cli();
+    showRGBInternal(pixels);
+    sei();
+    mWait.mark();
+  }
+#endif
+
+  // Copies the controller's dither (d, e), scale and advance values into an M0ClocklessData
+  // block, starts LED_TIMER, runs showLedData() and stops the timer again.
+  // Returns the showLedData() result; callers treat 0 as "interrupted, retry".
+  // NOTE(review): the remark below about register Y refers to AVR and appears to be
+  // carried over from the AVR implementation.
+  // This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
+  // gcc will use register Y for the this pointer.
+  static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+    struct M0ClocklessData data;
+    data.d[0] = pixels.d[0];
+    data.d[1] = pixels.d[1];
+    data.d[2] = pixels.d[2];
+    data.s[0] = pixels.mScale[0];
+    data.s[1] = pixels.mScale[1];
+    data.s[2] = pixels.mScale[2];
+    data.e[0] = pixels.e[0];
+    data.e[1] = pixels.e[1];
+    data.e[2] = pixels.e[2];
+    data.adj = pixels.mAdvance;
+
+    typename FastPin<DATA_PIN>::port_ptr_t portBase = FastPin<DATA_PIN>::port();
+
+    // timer mode w/prescaler of 0, 16 bit counter, compare 0 event cleared,
+    // and the compare0-to-clear shortcut enabled
+    LED_TIMER->MODE = TIMER_MODE_MODE_Timer;
+    LED_TIMER->PRESCALER = 0;
+    LED_TIMER->EVENTS_COMPARE[0] = 0;
+    LED_TIMER->BITMODE = TIMER_BITMODE_BITMODE_16Bit;
+    LED_TIMER->SHORTS = TIMER_SHORTS_COMPARE0_CLEAR_Msk;
+    LED_TIMER->TASKS_START = 1;
+
+    int ret = showLedData<4,8,T1,T2,T3,RGB_ORDER,WAIT_TIME>(portBase, FastPin<DATA_PIN>::mask(), pixels.mData, pixels.mLen, &data);
+
+    LED_TIMER->TASKS_STOP = 1;
+    return ret; // 0x00FFFFFF - _VAL;
+  }
+};
+
+
+#endif // NRF51
+#endif // __INC_CLOCKLESS_ARM_NRF51
@@ -0,0 +1,11 @@
+#ifndef __INC_FASTLED_ARM_NRF51_H
+#define __INC_FASTLED_ARM_NRF51_H
+
+// Include the nrf51 headers
+#include "bitswap.h"
+#include "fastled_delay.h"
+#include "fastpin_arm_nrf51.h"
+#include "fastspi_arm_nrf51.h"
+#include "clockless_arm_nrf51.h"
+
+#endif
@@ -0,0 +1,119 @@
+#ifndef __FASTPIN_ARM_NRF51_H
+#define __FASTPIN_ARM_NRF51_H
+
+#if defined(NRF51)
+/// Template definition for teensy 3.0 style ARM pins, providing direct access to the various GPIO registers.  Note that this
+/// uses the full port GPIO registers.  In theory, in some way, bit-band register access -should- be faster, however I have found
+/// that something about the way gcc does register allocation results in the bit-band code being slower.  It will need more fine tuning.
+/// The registers are data output, set output, clear output, toggle output, input, and direction
+#if 0
+// Compiled out by the enclosing "#if 0": an earlier variant of _ARMPIN that takes each
+// register as a wrapper type parameter. The variant in the #else branch is the one in use.
+template<uint8_t PIN, uint32_t _MASK, typename _DIRSET, typename _DIRCLR, typename _OUTSET, typename _OUTCLR, typename _OUT> class _ARMPIN {
+public:
+  typedef volatile uint32_t * port_ptr_t;
+  typedef uint32_t port_t;
+
+  inline static void setOutput() { _DIRSET::r() = _MASK; }
+  inline static void setInput() { _DIRCLR::r() = _MASK; }
+
+  inline static void hi() __attribute__ ((always_inline)) { _OUTSET::r() = _MASK; }
+  inline static void lo() __attribute__ ((always_inline)) { _OUTCLR::r() = _MASK; }
+  inline static void set(register port_t val) __attribute__ ((always_inline)) { _OUT::r() = val; }
+
+  inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+  inline static void toggle() __attribute__ ((always_inline)) { _OUT::r() ^= _MASK; }
+
+  inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+  inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+  inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+  inline static port_t hival() __attribute__ ((always_inline)) { return _OUT::r() | _MASK; }
+  inline static port_t loval() __attribute__ ((always_inline)) { return _OUT::r() & ~_MASK; }
+  inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_OUT::r(); }
+  inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+// Still inside the "#if 0" region, so none of the following is compiled.
+// Absolute addresses of the GPIO registers used by the disabled _ARMPIN variant.
+#define ADDR(X) *(volatile uint32_t*)X
+// NOTE(review): NR_GPIO_ADDR is only referenced from comments and its parenthesisation looks malformed.
+#define NR_GPIO_ADDR(base,offset) (*(volatile uint32_t *))((uint32_t)(base + offset))
+#define NR_DIRSET ADDR(0x50000518UL) // NR_GPIO_ADDR(NRF_GPIO_BASE, 0x518)
+#define NR_DIRCLR ADDR(0x5000051CUL) // NR_GPIO_ADDR(NRF_GPIO_BASE, 0x51C)
+#define NR_OUTSET ADDR(0x50000508UL) // NR_GPIO_ADDR(NRF_GPIO_BASE, 0x508)
+#define NR_OUTCLR ADDR(0x5000050CUL) // NR_GPIO_ADDR(NRF_GPIO_BASE, 0x50C)
+#define NR_OUT ADDR(0x50000504UL) // NR_GPIO_ADDR(NRF_GPIO_BASE, 0x504)
+
+// Wrapper struct whose r() returns the register T.
+#define _RD32_NRF(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline reg32_t r() { return T; }};
+
+_RD32_NRF(NR_DIRSET);
+_RD32_NRF(NR_DIRCLR);
+_RD32_NRF(NR_OUTSET);
+_RD32_NRF(NR_OUTCLR);
+_RD32_NRF(NR_OUT);
+
+// NOTE(review): _R is not defined anywhere in this header.
+#define _DEFPIN_ARM(PIN) template<> class FastPin<PIN> : public _ARMPIN<PIN, 1 << PIN, \
+  _R(NR_DIRSET), _R(NR_DIRCLR), _R(NR_OUTSET), _R(NR_OUTCLR), _R(NR_OUT)> {};
+#else
+
+// GPIO register block as seen from the OUT register onwards. The leading 321-word reserved
+// area (321 * 4 = 0x504 bytes) is commented out, and FL_NRF_GPIO_BASE below is offset by
+// the same 0x504 so that the first member lands on OUT.
+typedef struct {                                    /*!< GPIO Structure                                                        */
+  // __I  uint32_t  RESERVED0[321];
+  __IO uint32_t  OUT;                               /*!< Write GPIO port.                                                      */
+  __IO uint32_t  OUTSET;                            /*!< Set individual bits in GPIO port.                                     */
+  __IO uint32_t  OUTCLR;                            /*!< Clear individual bits in GPIO port.                                   */
+  __I  uint32_t  IN;                                /*!< Read GPIO port.                                                       */
+  __IO uint32_t  DIR;                               /*!< Direction of GPIO pins.                                               */
+  __IO uint32_t  DIRSET;                            /*!< DIR set register.                                                     */
+  __IO uint32_t  DIRCLR;                            /*!< DIR clear register.                                                   */
+  __I  uint32_t  RESERVED1[120];
+  __IO uint32_t  PIN_CNF[32];                       /*!< Configuration of GPIO pins.                                           */
+} FL_NRF_GPIO_Type;
+
+// Address of the OUT register, and the register block viewed through the struct above.
+#define FL_NRF_GPIO_BASE                   0x50000504UL
+#define FL_NRF_GPIO                        ((FL_NRF_GPIO_Type           *) FL_NRF_GPIO_BASE)
+
+// Static accessors for one GPIO pin, implemented on the FL_NRF_GPIO register block.
+// _MASK is the value written to, or combined with, each register.
+template<uint8_t PIN, uint32_t _MASK> class _ARMPIN {
+public:
+  typedef volatile uint32_t * port_ptr_t;
+  typedef uint32_t port_t;
+
+  // Direction is changed by writing the mask to the DIR set/clear registers.
+  inline static void setOutput() { FL_NRF_GPIO->DIRSET = _MASK; }
+  inline static void setInput() { FL_NRF_GPIO->DIRCLR = _MASK; }
+
+  // hi()/lo() write the mask to OUTSET/OUTCLR; set() overwrites the whole OUT register.
+  inline static void hi() __attribute__ ((always_inline)) { FL_NRF_GPIO->OUTSET = _MASK; }
+  inline static void lo() __attribute__ ((always_inline)) { FL_NRF_GPIO->OUTCLR= _MASK; }
+  inline static void set(register port_t val) __attribute__ ((always_inline)) { FL_NRF_GPIO->OUT = val; }
+
+  // strobe() is two consecutive toggles.
+  inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+  // toggle() is a read-modify-write of the whole OUT register.
+  inline static void toggle() __attribute__ ((always_inline)) { FL_NRF_GPIO->OUT ^= _MASK; }
+
+  // The port argument of hi(port)/lo(port) is ignored; fastset() writes val through the given pointer.
+  inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+  inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+  inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+  // hival()/loval(): current OUT value with this pin's mask set / cleared.
+  inline static port_t hival() __attribute__ ((always_inline)) { return FL_NRF_GPIO->OUT | _MASK; }
+  inline static port_t loval() __attribute__ ((always_inline)) { return FL_NRF_GPIO->OUT & ~_MASK; }
+  inline static port_ptr_t port() __attribute__ ((always_inline)) { return &FL_NRF_GPIO->OUT; }
+  inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+
+  // True when any masked bit is set in the IN register.
+  inline static bool isset() __attribute__ ((always_inline)) { return (FL_NRF_GPIO->IN & _MASK) != 0; }
+};
+
+
+// Specialise FastPin<PIN> with a single-bit mask. The shift uses an unsigned literal so that
+// pin 31 yields 0x80000000 instead of overflowing a signed int.
+#define _DEFPIN_ARM(PIN) template<> class FastPin<PIN> : public _ARMPIN<PIN, 1u << PIN> {};
+#endif
+
+// Actual pin definitions
+// Pins 0 through 31 are defined; each pin number is also used as the bit position of its mask.
+#define MAX_PIN 31
+_DEFPIN_ARM(0); _DEFPIN_ARM(1); _DEFPIN_ARM(2); _DEFPIN_ARM(3);
+_DEFPIN_ARM(4); _DEFPIN_ARM(5); _DEFPIN_ARM(6); _DEFPIN_ARM(7);
+_DEFPIN_ARM(8); _DEFPIN_ARM(9); _DEFPIN_ARM(10); _DEFPIN_ARM(11);
+_DEFPIN_ARM(12); _DEFPIN_ARM(13); _DEFPIN_ARM(14); _DEFPIN_ARM(15);
+_DEFPIN_ARM(16); _DEFPIN_ARM(17); _DEFPIN_ARM(18); _DEFPIN_ARM(19);
+_DEFPIN_ARM(20); _DEFPIN_ARM(21); _DEFPIN_ARM(22); _DEFPIN_ARM(23);
+_DEFPIN_ARM(24); _DEFPIN_ARM(25); _DEFPIN_ARM(26); _DEFPIN_ARM(27);
+_DEFPIN_ARM(28); _DEFPIN_ARM(29); _DEFPIN_ARM(30); _DEFPIN_ARM(31);
+
+#define HAS_HARDWARE_PIN_SUPPORT
+
+#endif
+
+#endif
@@ -0,0 +1,146 @@
+#ifndef __INC_FASTSPI_NRF_H
+#define __INC_FASTSPI_NRF_H
+
+#ifdef NRF51
+
+// Hardware SPI output for the nRF51, built directly on the NRF_SPI0 peripheral registers.  It provides the set of
+// methods that the SPI chipset implementations call (init, select/release, wait, and the various write methods).
+// select() snapshots the SPI0 pin-select/frequency/enable registers before reconfiguring the peripheral and
+// release() writes the snapshot back.  Note that _SPI_CLOCK_DIVIDER is currently not consulted; init() always
+// programs the same FREQUENCY value.
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class NRF51SPIOutput {
+
+  // snapshot of the NRF_SPI0 registers that init() overwrites
+  struct saveData {
+    uint32_t sck;
+    uint32_t mosi;
+    uint32_t miso;
+    uint32_t freq;
+    uint32_t enable;
+  } mSavedData;
+
+  // capture the current SPI0 configuration so that release() can put it back
+  void saveSPIData() {
+    mSavedData.sck = NRF_SPI0->PSELSCK;
+    mSavedData.mosi = NRF_SPI0->PSELMOSI;
+    mSavedData.miso = NRF_SPI0->PSELMISO;
+    mSavedData.freq = NRF_SPI0->FREQUENCY;
+    mSavedData.enable = NRF_SPI0->ENABLE;
+  }
+
+  // write the configuration captured by saveSPIData() back to SPI0
+  void restoreSPIData() {
+    NRF_SPI0->PSELSCK = mSavedData.sck;
+    NRF_SPI0->PSELMOSI = mSavedData.mosi;
+    NRF_SPI0->PSELMISO = mSavedData.miso;
+    NRF_SPI0->FREQUENCY = mSavedData.freq;
+    // restore the saved enable state (this used to re-read ENABLE into the snapshot, so the peripheral was
+    // always left in whatever enable state init() had set)
+    NRF_SPI0->ENABLE = mSavedData.enable;
+  }
+
+public:
+  NRF51SPIOutput() { FastPin<_DATA_PIN>::setOutput(); FastPin<_CLOCK_PIN>::setOutput(); }
+  // the selectable is currently ignored, see setSelect()
+  NRF51SPIOutput(Selectable *pSelect) { FastPin<_DATA_PIN>::setOutput(); FastPin<_CLOCK_PIN>::setOutput(); }
+
+  // set the object representing the selectable
+  void setSelect(Selectable *pSelect) { /* TODO */ }
+
+  // initialize the SPI subsystem: route SCK/MOSI to the template pins and enable SPI0
+  void init() {
+    FastPin<_DATA_PIN>::setOutput();
+    FastPin<_CLOCK_PIN>::setOutput();
+    NRF_SPI0->PSELSCK = _CLOCK_PIN;
+    NRF_SPI0->PSELMOSI = _DATA_PIN;
+    // NOTE(review): 0xFFFFFFFF is presumably the "not connected" pin selection (output only) - confirm
+    NRF_SPI0->PSELMISO = 0xFFFFFFFF;
+    // NOTE(review): 0x80000000 is presumably the 8 Mbps FREQUENCY setting - confirm against the nRF51 manual
+    NRF_SPI0->FREQUENCY = 0x80000000;
+    NRF_SPI0->ENABLE = 1;
+    NRF_SPI0->EVENTS_READY = 0;
+  }
+
+  // latch the CS select: save the current SPI0 configuration, then take over the peripheral
+  void select() { saveSPIData(); init(); }
+
+  // release the CS select: hand SPI0 back in its previous configuration
+  void release() { restoreSPIData(); }
+
+  // Whether wait()/waitFully() should spin on EVENTS_READY.  Waiting is currently disabled, so this always
+  // returns false and the argument is ignored.
+  static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+    (void)wait;
+    return false;
+  }
+
+  // wait until all queued up data has been written.  Static (it only touches peripheral registers) so that the
+  // static write helpers below can call it.
+  static void waitFully() __attribute__((always_inline)) {
+    if(shouldWait()) { while(NRF_SPI0->EVENTS_READY==0); }
+    NRF_SPI0->EVENTS_READY=0;
+    // the received value is not used; only the RXD register read itself is kept from the original code
+    uint8_t b = NRF_SPI0->RXD;
+    (void)b;
+  }
+
+  // same register sequence as waitFully()
+  static void wait() __attribute__((always_inline)) {
+    if(shouldWait()) { while(NRF_SPI0->EVENTS_READY==0); }
+    NRF_SPI0->EVENTS_READY=0;
+    uint8_t b = NRF_SPI0->RXD;
+    (void)b;
+  }
+
+  // write a byte out via SPI (returns immediately on writing register)
+  static void writeByte(uint8_t b) __attribute__((always_inline)) {
+    NRF_SPI0->EVENTS_READY=0;
+    NRF_SPI0->TXD = b;
+  }
+
+  // write a word out via SPI, high byte first (returns immediately on writing register)
+  static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w & 0xFF); }
+
+  // A raw set of writing byte values, assumes setup/init/waiting done elsewhere (static for use by adjustment classes)
+  static void writeBytesValueRaw(uint8_t value, int len) { while(len--) { writeByte(value); } }
+
+  // A full cycle of writing a value for len bytes, including select, release, and waiting
+  void writeBytesValue(uint8_t value, int len) {
+    select();
+    while(len--) {
+      writeByte(value);
+    }
+    waitFully();
+    release();
+  }
+
+  // A full cycle of writing a raw block of data out, including select, release, and waiting.  Every byte is
+  // passed through D::adjust() and D::postBlock(len) is called once the block has been queued.
+  template<class D> void writeBytes(uint8_t *data, int len) {
+    uint8_t *end = data + len;
+    select();
+    while(data != end) {
+      writeByte(D::adjust(*data++));
+    }
+    D::postBlock(len);
+    waitFully();
+    release();
+  }
+
+  // writeBytes without any per-byte adjustment
+  void writeBytes(uint8_t *data, int len) {
+    writeBytes<DATA_NOP>(data, len);
+  }
+
+  // write a single bit out, which bit from the passed in byte is determined by template parameter.  SPI0 is
+  // disabled while the data pin is set and the clock pin is toggled twice by hand, then enabled again.
+  template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+    waitFully();
+    NRF_SPI0->ENABLE = 0;
+    if(b & (1<<BIT)) {
+      FastPin<_DATA_PIN>::hi();
+    } else {
+      FastPin<_DATA_PIN>::lo();
+    }
+    FastPin<_CLOCK_PIN>::toggle();
+    FastPin<_CLOCK_PIN>::toggle();
+    NRF_SPI0->ENABLE = 1;
+  }
+
+  // write out every pixel of the controller as three adjusted bytes, preceded by a single 1 bit per pixel when
+  // FLAGS contains FLAG_START_BIT
+  template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+    select();
+    int len = pixels.mLen;
+    while(pixels.has(1)) {
+      if(FLAGS & FLAG_START_BIT) {
+        writeBit<0>(1);
+      }
+      writeByte(D::adjust(pixels.loadAndScale0()));
+      writeByte(D::adjust(pixels.loadAndScale1()));
+      writeByte(D::adjust(pixels.loadAndScale2()));
+
+      pixels.advanceData();
+      pixels.stepDithering();
+    }
+    D::postBlock(len);
+    waitFully();
+    release();
+  }
+
+};
+
+#endif
+
+#endif
@@ -0,0 +1,44 @@
+// System definitions for the nRF51 (ARM Cortex-M0) port: platform feature flags, register typedefs and
+// interrupt enable/disable macros.
+#ifndef __LED_SYSDEFS_ARM_NRF51
+#define __LED_SYSDEFS_ARM_NRF51
+
+// Make sure NRF51 is defined for the headers that test for it
+#ifndef NRF51
+#define NRF51
+#endif
+
+// Platform feature flags
+#define LED_TIMER NRF_TIMER1
+#define FASTLED_NO_PINMAP
+#define FASTLED_HAS_CLOCKLESS
+
+#define FASTLED_ARM
+#define FASTLED_ARM_M0
+
+// Fallback CPU clock in Hz, used only when the build has not defined F_CPU
+#ifndef F_CPU
+#define F_CPU 16000000
+#endif
+
+#include <stdint.h>
+#include "nrf51.h"
+#include "core_cm0.h"
+
+// Register and compatibility typedefs
+typedef volatile uint32_t RoReg;
+typedef volatile uint32_t RwReg;
+typedef uint32_t prog_uint32_t;
+typedef uint8_t boolean;
+
+// PROGMEM expands to nothing here
+#define PROGMEM
+#define NO_PROGMEM
+#define NEED_CXX_BITS
+
+// Default to NOT using PROGMEM here
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 0
+#endif
+
+// Default to allowing interrupts
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 1
+#endif
+
+// cli()/sei() map to the CMSIS interrupt disable/enable calls.  Each expansion already ends in a semicolon.
+#define cli()  __disable_irq();
+#define sei() __enable_irq();
+
+#endif
@@ -0,0 +1,145 @@
+#ifndef __INC_CLOCKLESS_ARM_SAM_H
+#define __INC_CLOCKLESS_ARM_SAM_H
+
+FASTLED_NAMESPACE_BEGIN
+
+// Definition for a single channel clockless controller for the SAM family of ARM chips, like the SAM3X8E used in the Arduino Due.
+// See clockless.h for detailed info on how the template parameters are used.
+
+#if defined(__SAM3X8E__)
+
+
+// Timing helpers.  T1, T2 and T3 are not defined here; these macros pick up the template parameters of the
+// class in which they are expanded.
+// TADJUST: constant added to each of the three timing segments (currently 0)
+#define TADJUST 0
+// TOTAL: sum of the three (adjusted) segments; the controller advances next_mark by this amount for every bit
+#define TOTAL ( (T1+TADJUST) + (T2+TADJUST) + (T3+TADJUST) )
+// T1_MARK / T2_MARK: TOTAL minus the first segment, and minus the first two segments
+#define T1_MARK (TOTAL - (T1+TADJUST))
+#define T2_MARK (T1_MARK - (T2+TADJUST))
+
+// Scaling function selection
+#define SCALE(S,V) scale8_video(S,V)
+// #define SCALE(S,V) scale8(S,V)
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Single-lane clockless controller.  Bits are timed by polling DUE_TIMER_VAL against a running "next_mark"
+// deadline and written through the FastPinBB<DATA_PIN> port pointer (the code writes 1 / 0 to it).
+// T1/T2/T3 are the three timing segments of one bit (see TOTAL), XTRA0 adds extra bits to every byte written,
+// WAIT_TIME is the minimum gap enforced between frames through CMinWait.  FLIP is not used in this class.
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+	typedef typename FastPinBB<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPinBB<DATA_PIN>::port_t data_t;
+
+	data_t mPinMask;
+	data_ptr_t mPort;
+	CMinWait<WAIT_TIME> mWait;
+public:
+	// configure the data pin as an output and cache its mask and port pointer
+	virtual void init() {
+		FastPinBB<DATA_PIN>::setOutput();
+		mPinMask = FastPinBB<DATA_PIN>::mask();
+		mPort = FastPinBB<DATA_PIN>::port();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+	// blank nLeds leds by showing black with a scale of 0
+	virtual void clearLeds(int nLeds) {
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+protected:
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+		// honor the minimum time between frames, write the data, then restart the wait clock
+		mWait.wait();
+		showRGBInternal(pixels);
+		mWait.mark();
+	}
+
+	// write out nLeds leds from the rgbdata array
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+		mWait.wait();
+		showRGBInternal(pixels);
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	// ARGB variant of show(); unlike the two methods above it also calls sei() after writing
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+		mWait.wait();
+		showRGBInternal(pixels);
+		sei();
+		mWait.mark();
+	}
+#endif
+
+
+	// Write BITS bits of b, most significant bit first.  b is shifted left as it is consumed and next_mark is
+	// advanced by TOTAL for every bit, so both are updated for the caller.
+	template<int BITS>  __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register data_ptr_t port, register uint8_t & b) {
+		// Make sure we don't slot into a wrapping spot, this will delay up to 12.5µs for WS2812
+		// bool bShift=0;
+		// while(VAL < (TOTAL*10)) { bShift=true; }
+		// if(bShift) { next_mark = (VAL-TOTAL); };
+
+		for(register uint32_t i = BITS; i > 0; i--) {
+			// wait to start the bit, then set the pin high
+			while(DUE_TIMER_VAL < next_mark);
+			next_mark = (DUE_TIMER_VAL+TOTAL);
+			*port = 1;
+
+			// how long we want to wait next depends on whether or not our bit is set to 1 or 0
+			if(b&0x80) {
+				// we're a 1, wait until there's less than T3 clocks left
+				while((next_mark - DUE_TIMER_VAL) > (T3));
+			} else {
+				// we're a 0, wait until there's less than (T2+T3+slop) clocks left in this bit
+				while((next_mark - DUE_TIMER_VAL) > (T2+T3+6+TADJUST+TADJUST));
+			}
+			*port=0;
+			b <<= 1;
+		}
+	}
+
+// Empty asm statement that takes var as a register input, so the compiler has to treat var as used
+#define FORCE_REFERENCE(var)  asm volatile( "" : : "r" (var) )
+	// Writes every pixel of the controller and returns the timer value read on exit.
+	// NOTE(review): the original remark here explained that the method is static to keep register Y free on
+	// AVR (a non-static method would use it for the this pointer); that reasoning looks inherited from the
+	// AVR implementation - confirm whether it matters on this target.
+	static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+		// Setup and start the clock
+		TC_Configure(DUE_TIMER,DUE_TIMER_CHANNEL,TC_CMR_TCCLKS_TIMER_CLOCK1);
+		pmc_enable_periph_clk(DUE_TIMER_ID);
+		TC_Start(DUE_TIMER,DUE_TIMER_CHANNEL);
+
+		// the port pointer is pinned to r7; the data line starts out low
+		register data_ptr_t port asm("r7") = FastPinBB<DATA_PIN>::port(); FORCE_REFERENCE(port);
+		*port = 0;
+
+		// Setup the pixel controller and load/scale the first byte
+		pixels.preStepFirstByteDithering();
+		register uint8_t b = pixels.loadAndScale0();
+
+		uint32_t next_mark = (DUE_TIMER_VAL + (TOTAL));
+		while(pixels.has(1)) {
+			pixels.stepDithering();
+
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			// interrupts were enabled since the previous pixel; if the timer has run too far past next_mark,
+			// give up on this frame: re-enable interrupts, stop the timer and return
+			cli();
+			if(DUE_TIMER_VAL > next_mark) {
+				if((DUE_TIMER_VAL - next_mark) > ((WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US)) { sei(); TC_Stop(DUE_TIMER,DUE_TIMER_CHANNEL); return DUE_TIMER_VAL; }
+			}
+			#endif
+
+			// write the three bytes of this pixel, loading/scaling the next byte between writes
+			writeBits<8+XTRA0>(next_mark, port, b);
+
+			b = pixels.loadAndScale1();
+			writeBits<8+XTRA0>(next_mark, port,b);
+
+			b = pixels.loadAndScale2();
+			writeBits<8+XTRA0>(next_mark, port,b);
+
+			// move on to the next pixel and preload its first byte
+			b = pixels.advanceAndLoadAndScale0();
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			sei();
+			#endif
+		};
+
+		TC_Stop(DUE_TIMER,DUE_TIMER_CHANNEL);
+		return DUE_TIMER_VAL;
+	}
+};
+
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,206 @@
+ #ifndef __INC_BLOCK_CLOCKLESS_H
+#define __INC_BLOCK_CLOCKLESS_H
+
+FASTLED_NAMESPACE_BEGIN
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Base template for clockless controllers.  These controllers have 3 control points in their cycle for each bit.  The first point
+// is where the line is raised hi.  The second point is where the line is dropped low for a zero.  The third point is where the
+// line is dropped low for a one.  T1, T2, and T3 correspond to the timings for those three in clock cycles.
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__SAM3X8E__)
+// Mask with the low LANES bits set, limited to 8 bits (both arms of the FIRST_PIN test currently yield 0xFF).
+// LANES and FIRST_PIN are the template parameters of the class in which the macro is expanded.
+#define PORT_MASK (((1<<LANES)-1) & ((FIRST_PIN==2) ? 0xFF : 0xFF))
+
+#define FASTLED_HAS_BLOCKLESS 1
+
+// FIRST_PIN values recognised by InlineBlockClocklessController::init() to pick which group of pins to set up
+#define PORTD_FIRST_PIN 25
+#define PORTA_FIRST_PIN 69
+#define PORTB_FIRST_PIN 90
+
+// Eight bytes, also addressable as two 32-bit words
+typedef union {
+  uint8_t bytes[8];
+  uint32_t raw[2];
+} Lines;
+
+// Timing helpers built from the T1/T2/T3 template parameters of the class in which they are expanded.
+// TOTAL is the amount next_mark is advanced for each bit.
+#define TADJUST 0
+#define TOTAL ( (T1+TADJUST) + (T2+TADJUST) + (T3+TADJUST) )
+#define T1_MARK (TOTAL - (T1+TADJUST))
+#define T2_MARK (T1_MARK - (T2+TADJUST))
+// Multi-lane ("block") clockless controller: writes up to 8 lanes at once by setting and clearing PORT_MASK
+// bits through the set/clear registers of the port that FIRST_PIN belongs to.  LANES is the number of lanes,
+// FIRST_PIN selects the pin group (PORTA_FIRST_PIN, PORTD_FIRST_PIN or PORTB_FIRST_PIN), T1/T2/T3 are the three
+// timing segments of a bit (see TOTAL) and WAIT_TIME is the minimum gap between frames enforced by CMinWait.
+// XTRA0 and FLIP are accepted for signature compatibility; writeBits always writes 8 bits per byte.
+template <uint8_t LANES, int FIRST_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class InlineBlockClocklessController : public CLEDController {
+	typedef typename FastPin<FIRST_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<FIRST_PIN>::port_t data_t;
+
+	data_t mPinMask;
+	data_ptr_t mPort;
+	CMinWait<WAIT_TIME> mWait;
+public:
+	// Configure the pins of the selected group as outputs.  Every switch below falls through on purpose:
+	// entering at "case LANES" sets up that pin and all pins of the lower numbered lanes.
+	virtual void init() {
+		if(FIRST_PIN == PORTA_FIRST_PIN) {
+			switch(LANES) {
+				case 8: FastPin<31>::setOutput();
+				case 7: FastPin<58>::setOutput();
+				case 6: FastPin<100>::setOutput();
+				case 5: FastPin<59>::setOutput();
+				case 4: FastPin<60>::setOutput();
+				case 3: FastPin<61>::setOutput();
+				case 2: FastPin<68>::setOutput();
+				case 1: FastPin<69>::setOutput();
+			}
+		} else if(FIRST_PIN == PORTD_FIRST_PIN) {
+			switch(LANES) {
+				case 8: FastPin<11>::setOutput();
+				case 7: FastPin<29>::setOutput();
+				case 6: FastPin<15>::setOutput();
+				case 5: FastPin<14>::setOutput();
+				case 4: FastPin<28>::setOutput();
+				case 3: FastPin<27>::setOutput();
+				case 2: FastPin<26>::setOutput();
+				case 1: FastPin<25>::setOutput();
+			}
+		} else if(FIRST_PIN == PORTB_FIRST_PIN) {
+			switch(LANES) {
+				case 8: FastPin<97>::setOutput();
+				case 7: FastPin<96>::setOutput();
+				case 6: FastPin<95>::setOutput();
+				case 5: FastPin<94>::setOutput();
+				case 4: FastPin<93>::setOutput();
+				case 3: FastPin<92>::setOutput();
+				case 2: FastPin<91>::setOutput();
+				case 1: FastPin<90>::setOutput();
+			}
+		}
+		mPinMask = FastPin<FIRST_PIN>::mask();
+		mPort = FastPin<FIRST_PIN>::port();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+	// blank nLeds leds by showing black with a scale of 0
+	virtual void clearLeds(int nLeds) {
+		showColor(CRGB(0, 0, 0), nLeds, 0);
+	}
+
+	// set all the leds on the controller to a given color
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<LANES,PORT_MASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		showRGBInternal(pixels, nLeds);
+		// kept from the original: this method unconditionally re-enables interrupts after writing
+		sei();
+		mWait.mark();
+	}
+
+// #define ADV_RGB
+#define ADV_RGB if(maskbit & PORT_MASK) { rgbdata += nLeds; } maskbit <<= 1;
+
+	// write out nLeds leds per lane from rgbdata
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		MultiPixelController<LANES,PORT_MASK,RGB_ORDER> pixels(rgbdata,nLeds, scale, getDither() );
+		mWait.wait();
+		showRGBInternal(pixels, nLeds);
+		mWait.mark();
+	}
+
+#ifdef SUPPORT_ARGB
+	// NOTE(review): this overload passes a single-lane PixelController temporary and no nLeds, which does not
+	// match showRGBInternal's signature below - it needs attention before SUPPORT_ARGB can be enabled here.
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		mWait.wait();
+		showRGBInternal(PixelController<RGB_ORDER>(rgbdata, nLeds, scale, getDither()));
+		mWait.mark();
+	}
+#endif
+
+	// Write one byte for every lane (8 bit times) from b, while loading the next byte of each lane into b3.
+	// b holds one byte per lane; it is transposed so that each output write carries one bit of every lane.
+	// PX selects which of the pixel's bytes is loaded next.  next_mark is advanced by TOTAL for every bit.
+	template<int BITS,int PX> __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register Lines & b, Lines & b3, MultiPixelController<LANES, PORT_MASK, RGB_ORDER> &pixels) { // , register uint32_t & b2)  {
+		register Lines b2;
+		transpose8x1(b.bytes,b2.bytes);
+
+		register uint8_t d = pixels.template getd<PX>(pixels);
+		register uint8_t scale = pixels.template getscale<PX>(pixels);
+
+		// first LANES bit times: the tail of each bit is used to load and scale the next byte of lane i
+		for(uint32_t i = 0; (i < LANES) && (i<8); i++) {
+			// wait to start the bit, then raise every lane
+			while(DUE_TIMER_VAL < next_mark);
+			next_mark = (DUE_TIMER_VAL+TOTAL);
+
+			*FastPin<FIRST_PIN>::sport() = PORT_MASK;
+
+			// drop the lanes whose bit is 0
+			while((next_mark - DUE_TIMER_VAL) > (T2+T3+6));
+			*FastPin<FIRST_PIN>::cport() = (~b2.bytes[7-i]) & PORT_MASK;
+
+			// drop the remaining lanes
+			while((next_mark - (DUE_TIMER_VAL)) > T3);
+			*FastPin<FIRST_PIN>::cport() = PORT_MASK;
+
+			b3.bytes[i] = pixels.template loadAndScale<PX>(pixels,i,d,scale);
+		}
+
+		// remaining bit times when fewer than 8 lanes are in use: same timing, nothing left to load.
+		// This loop previously waited with an inverted comparison and set next_mark *behind* the timer
+		// (down-counter arithmetic); since every other wait in this file treats DUE_TIMER_VAL as counting
+		// up, the unsigned (next_mark - DUE_TIMER_VAL) wrapped to a huge value and the wait never ended in
+		// any useful time.  It now uses the same scheme as the loop above.
+		for(uint32_t i = LANES; i < 8; i++) {
+			while(DUE_TIMER_VAL < next_mark);
+			next_mark = (DUE_TIMER_VAL+TOTAL);
+
+			*FastPin<FIRST_PIN>::sport() = PORT_MASK;
+
+			while((next_mark - DUE_TIMER_VAL) > (T2+T3+6));
+			*FastPin<FIRST_PIN>::cport() = (~b2.bytes[7-i]) & PORT_MASK;
+
+			while((next_mark - DUE_TIMER_VAL) > T3);
+			*FastPin<FIRST_PIN>::cport() = PORT_MASK;
+		}
+	}
+
+	// Write nLeds pixels on every lane and return the timer value read on exit.  Static, so it does not need a
+	// this pointer.  When FASTLED_ALLOW_INTERRUPTS is 1, interrupts are enabled between pixels and the frame is
+	// abandoned if the timer has run too far past next_mark by the time the next pixel starts.
+	static uint32_t showRGBInternal(MultiPixelController<LANES, PORT_MASK, RGB_ORDER> &allpixels, int nLeds) {
+		// Setup the pixel controller and load/scale the first byte.  The scratch buffers are zeroed because
+		// transpose8x1 reads all 8 bytes even when fewer than 8 lanes are loaded.
+		Lines b0 = {{0}}, b1 = {{0}}, b2 = {{0}};
+
+		allpixels.preStepFirstByteDithering();
+		for(int i = 0; i < LANES; i++) {
+			b0.bytes[i] = allpixels.loadAndScale0(i);
+		}
+
+		// Setup and start the clock
+		TC_Configure(DUE_TIMER,DUE_TIMER_CHANNEL,TC_CMR_TCCLKS_TIMER_CLOCK1);
+		pmc_enable_periph_clk(DUE_TIMER_ID);
+		TC_Start(DUE_TIMER,DUE_TIMER_CHANNEL);
+
+		#if (FASTLED_ALLOW_INTERRUPTS == 1)
+		cli();
+		#endif
+		uint32_t next_mark = (DUE_TIMER_VAL + (TOTAL));
+		while(nLeds--) {
+			allpixels.stepDithering();
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			cli();
+			if(DUE_TIMER_VAL > next_mark) {
+				if((DUE_TIMER_VAL - next_mark) > ((WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US)) {
+					sei(); TC_Stop(DUE_TIMER,DUE_TIMER_CHANNEL); return DUE_TIMER_VAL;
+				}
+			}
+			#endif
+
+			// Write first byte, read next byte
+			writeBits<8+XTRA0,1>(next_mark, b0, b1, allpixels);
+
+			// Write second byte, read 3rd byte
+			writeBits<8+XTRA0,2>(next_mark, b1, b2, allpixels);
+
+			allpixels.advanceData();
+			// Write third byte
+			writeBits<8+XTRA0,0>(next_mark, b2, b0, allpixels);
+
+			#if (FASTLED_ALLOW_INTERRUPTS == 1)
+			sei();
+			#endif
+		}
+
+		#if (FASTLED_ALLOW_INTERRUPTS == 1)
+		// balance the cli() issued before the loop; without this a call with nLeds == 0 returned with
+		// interrupts still disabled
+		sei();
+		#endif
+
+		// stop the timer on the normal path as well, as the early exit above and the single-lane controller do
+		TC_Stop(DUE_TIMER,DUE_TIMER_CHANNEL);
+		return DUE_TIMER_VAL;
+	}
+
+
+};
+
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,11 @@
+#ifndef __INC_FASTLED_ARM_SAM_H
+#define __INC_FASTLED_ARM_SAM_H
+
+// Include the sam headers
+#include "fastled_delay.h"
+#include "fastpin_arm_sam.h"
+#include "fastspi_arm_sam.h"
+#include "clockless_arm_sam.h"
+#include "clockless_block_arm_sam.h"
+
+#endif
@@ -0,0 +1,137 @@
+#ifndef __INC_FASTPIN_ARM_SAM_H
+#define __INC_FASTPIN_ARM_SAM_H
+
+FASTLED_NAMESPACE_BEGIN
+
+// When software pins are forced, none of the hardware FastPin definitions below are compiled in
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+#warning "Software pin support forced, pin access will be slightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+
+/// Pin class for Arduino Due style ARM pins that works on the full-port GPIO registers.  Bit-band register
+/// access should in theory be faster, but the author found that gcc's register allocation made the bit-band
+/// code slower, so this variant is the one in use until that is tuned.
+/// The register types are, in order: data register (_PDOR), set output register (_PSOR), clear output
+/// register (_PCOR) and set data direction register (_PDDR).
+template<uint8_t PIN, uint32_t _MASK, typename _PDOR, typename _PSOR, typename _PCOR, typename _PDDR> class _DUEPIN {
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	// Direction changes go through pinMode for now.
+	// TODO: perform MUX config here instead ({ _PDDR::r() |= _MASK; } / { _PDDR::r() &= ~_MASK; })
+	inline static void setOutput() { pinMode(PIN, OUTPUT); }
+	inline static void setInput() { pinMode(PIN, INPUT); }
+
+	// drive the pin through the set / clear registers, or replace the whole data register
+	__attribute__ ((always_inline)) inline static void hi() { _PSOR::r() = _MASK; }
+	__attribute__ ((always_inline)) inline static void lo() { _PCOR::r() = _MASK; }
+	__attribute__ ((always_inline)) inline static void set(register port_t val) { _PDOR::r() = val; }
+
+	// flip the pin's bit in the data register
+	__attribute__ ((always_inline)) inline static void toggle() { _PDOR::r() ^= _MASK; }
+
+	// two toggles back to back
+	__attribute__ ((always_inline)) inline static void strobe() {
+		_PDOR::r() ^= _MASK;
+		_PDOR::r() ^= _MASK;
+	}
+
+	// pointer flavoured variants; hi/lo ignore the pointer and use the set / clear registers
+	__attribute__ ((always_inline)) inline static void hi(register port_ptr_t port) { _PSOR::r() = _MASK; }
+	__attribute__ ((always_inline)) inline static void lo(register port_ptr_t port) { _PCOR::r() = _MASK; }
+	__attribute__ ((always_inline)) inline static void fastset(register port_ptr_t port, register port_t val) { *port = val; }
+
+	// current data register value with this pin's bit forced on / off
+	__attribute__ ((always_inline)) inline static port_t hival() {
+		port_t current = _PDOR::r();
+		return current | _MASK;
+	}
+	__attribute__ ((always_inline)) inline static port_t loval() {
+		port_t current = _PDOR::r();
+		return current & ~_MASK;
+	}
+
+	// addresses of the data, set and clear registers, and the pin's mask
+	__attribute__ ((always_inline)) inline static port_ptr_t port() { return &_PDOR::r(); }
+	__attribute__ ((always_inline)) inline static port_ptr_t sport() { return &_PSOR::r(); }
+	__attribute__ ((always_inline)) inline static port_ptr_t cport() { return &_PCOR::r(); }
+	__attribute__ ((always_inline)) inline static port_t mask() { return _MASK; }
+};
+
+
+/// Pin class for Due style ARM pins that goes through the bit-band alias of the data register: every access
+/// reads or writes the single word returned by _PDOR::rx<_BIT>().  GCC does a poor job of optimizing around
+/// these accesses, so they are not being used widely just yet.
+template<uint8_t PIN, uint32_t _BIT, typename _PDOR, typename _PSOR, typename _PCOR, typename _PDDR> class _DUEPIN_BITBAND {
+public:
+	typedef volatile uint32_t * port_ptr_t;
+	typedef uint32_t port_t;
+
+	// Direction changes go through pinMode for now.
+	// TODO: perform MUX config here instead ({ _PDDR::r() |= _MASK; } / { _PDDR::r() &= ~_MASK; })
+	inline static void setOutput() { pinMode(PIN, OUTPUT); }
+	inline static void setInput() { pinMode(PIN, INPUT); }
+
+	// write 1, 0 or an arbitrary value to the pin's bit-band word
+	__attribute__ ((always_inline)) inline static void hi() { *_PDOR::template rx<_BIT>() = 1; }
+	__attribute__ ((always_inline)) inline static void lo() { *_PDOR::template rx<_BIT>() = 0; }
+	__attribute__ ((always_inline)) inline static void set(register port_t val) { *_PDOR::template rx<_BIT>() = val; }
+
+	// flip the pin's bit-band word
+	__attribute__ ((always_inline)) inline static void toggle() { *_PDOR::template rx<_BIT>() ^= 1; }
+
+	// two toggles back to back
+	__attribute__ ((always_inline)) inline static void strobe() {
+		*_PDOR::template rx<_BIT>() ^= 1;
+		*_PDOR::template rx<_BIT>() ^= 1;
+	}
+
+	// pointer flavoured variants; hi/lo ignore the pointer
+	__attribute__ ((always_inline)) inline static void hi(register port_ptr_t port) { *_PDOR::template rx<_BIT>() = 1; }
+	__attribute__ ((always_inline)) inline static void lo(register port_ptr_t port) { *_PDOR::template rx<_BIT>() = 0; }
+	__attribute__ ((always_inline)) inline static void fastset(register port_ptr_t port, register port_t val) { *port = val; }
+
+	// with one word per pin the high value, low value and mask are simply 1, 0 and 1
+	__attribute__ ((always_inline)) inline static port_t hival() { return 1; }
+	__attribute__ ((always_inline)) inline static port_t loval() { return 0; }
+	__attribute__ ((always_inline)) inline static port_ptr_t port() { return _PDOR::template rx<_BIT>(); }
+	__attribute__ ((always_inline)) inline static port_t mask() { return 1; }
+};
+
+// Address of the bit-band alias word for bit `bit` of peripheral register `reg`:
+// (offset of reg from 0x40000000) * 32 + bit * 4, based at 0x42000000
+#define GPIO_BITBAND_ADDR(reg, bit) (((uint32_t)&(reg) - 0x40000000) * 32 + (bit) * 4 + 0x42000000)
+#define GPIO_BITBAND_PTR(reg, bit) ((uint32_t *)GPIO_BITBAND_ADDR((reg), (bit)))
+
+// _R(T) names the accessor struct for register T; _RD32(T) defines it: r() yields the register itself and
+// rx<BIT>() yields a pointer to the bit-band alias word of one of its bits.
+#define _R(T) struct __gen_struct_ ## T
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline reg32_t r() { return T; } \
+	template<int BIT> static __attribute__((always_inline)) inline ptr_reg32_t rx() { return GPIO_BITBAND_PTR(T, BIT); } };
+// Defines the accessor structs for the ODSR, SODR, CODR and OER registers of PIO controller L
+#define DUE_IO32(L) _RD32(REG_PIO ## L ## _ODSR); _RD32(REG_PIO ## L ## _SODR); _RD32(REG_PIO ## L ## _CODR); _RD32(REG_PIO ## L ## _OER);
+
+// Declares FastPin<PIN> (full-port access) and FastPinBB<PIN> (bit-band access) for bit BIT of PIO controller L.
+// The mask is built with an unsigned shift so that BIT == 31 does not overflow a signed int, and the direction
+// register argument names the REG_PIO<L>_OER accessor that DUE_IO32(L) actually defines.
+#define _DEFPIN_DUE(PIN, BIT, L) template<> class FastPin<PIN> : public _DUEPIN<PIN, 1u << (BIT), _R(REG_PIO ## L ## _ODSR), _R(REG_PIO ## L ## _SODR), _R(REG_PIO ## L ## _CODR), \
+  																			_R(REG_PIO ## L ## _OER)> {}; \
+  								   template<> class FastPinBB<PIN> : public _DUEPIN_BITBAND<PIN, BIT, _R(REG_PIO ## L ## _ODSR), _R(REG_PIO ## L ## _SODR), _R(REG_PIO ## L ## _CODR), \
+  																			_R(REG_PIO ## L ## _OER)> {};
+
+#if defined(__SAM3X8E__)
+
+// Register accessor structs for PIO controllers A through D
+DUE_IO32(A);
+DUE_IO32(B);
+DUE_IO32(C);
+DUE_IO32(D);
+
+// Pin table: _DEFPIN_DUE(pin number, bit within the port, port letter).
+// Note that the digix pins further down (90-113) are numbered above MAX_PIN.
+#define MAX_PIN 78
+_DEFPIN_DUE(0, 8, A); _DEFPIN_DUE(1, 9, A); _DEFPIN_DUE(2, 25, B); _DEFPIN_DUE(3, 28, C);
+_DEFPIN_DUE(4, 26, C); _DEFPIN_DUE(5, 25, C); _DEFPIN_DUE(6, 24, C); _DEFPIN_DUE(7, 23, C);
+_DEFPIN_DUE(8, 22, C); _DEFPIN_DUE(9, 21, C); _DEFPIN_DUE(10, 29, C); _DEFPIN_DUE(11, 7, D);
+_DEFPIN_DUE(12, 8, D); _DEFPIN_DUE(13, 27, B); _DEFPIN_DUE(14, 4, D); _DEFPIN_DUE(15, 5, D);
+_DEFPIN_DUE(16, 13, A); _DEFPIN_DUE(17, 12, A); _DEFPIN_DUE(18, 11, A); _DEFPIN_DUE(19, 10, A);
+_DEFPIN_DUE(20, 12, B); _DEFPIN_DUE(21, 13, B); _DEFPIN_DUE(22, 26, B); _DEFPIN_DUE(23, 14, A);
+_DEFPIN_DUE(24, 15, A); _DEFPIN_DUE(25, 0, D); _DEFPIN_DUE(26, 1, D); _DEFPIN_DUE(27, 2, D);
+_DEFPIN_DUE(28, 3, D); _DEFPIN_DUE(29, 6, D); _DEFPIN_DUE(30, 9, D); _DEFPIN_DUE(31, 7, A);
+_DEFPIN_DUE(32, 10, D); _DEFPIN_DUE(33, 1, C); _DEFPIN_DUE(34, 2, C); _DEFPIN_DUE(35, 3, C);
+_DEFPIN_DUE(36, 4, C); _DEFPIN_DUE(37, 5, C); _DEFPIN_DUE(38, 6, C); _DEFPIN_DUE(39, 7, C);
+_DEFPIN_DUE(40, 8, C); _DEFPIN_DUE(41, 9, C); _DEFPIN_DUE(42, 19, A); _DEFPIN_DUE(43, 20, A);
+_DEFPIN_DUE(44, 19, C); _DEFPIN_DUE(45, 18, C); _DEFPIN_DUE(46, 17, C); _DEFPIN_DUE(47, 16, C);
+_DEFPIN_DUE(48, 15, C); _DEFPIN_DUE(49, 14, C); _DEFPIN_DUE(50, 13, C); _DEFPIN_DUE(51, 12, C);
+_DEFPIN_DUE(52, 21, B); _DEFPIN_DUE(53, 14, B); _DEFPIN_DUE(54, 16, A); _DEFPIN_DUE(55, 24, A);
+_DEFPIN_DUE(56, 23, A); _DEFPIN_DUE(57, 22, A); _DEFPIN_DUE(58, 6, A); _DEFPIN_DUE(59, 4, A);
+_DEFPIN_DUE(60, 3, A); _DEFPIN_DUE(61, 2, A); _DEFPIN_DUE(62, 17, B); _DEFPIN_DUE(63, 18, B);
+_DEFPIN_DUE(64, 19, B); _DEFPIN_DUE(65, 20, B); _DEFPIN_DUE(66, 15, B); _DEFPIN_DUE(67, 16, B);
+_DEFPIN_DUE(68, 1, A); _DEFPIN_DUE(69, 0, A); _DEFPIN_DUE(70, 17, A); _DEFPIN_DUE(71, 18, A);
+_DEFPIN_DUE(72, 30, C); _DEFPIN_DUE(73, 21, A); _DEFPIN_DUE(74, 25, A); _DEFPIN_DUE(75, 26, A);
+_DEFPIN_DUE(76, 27, A); _DEFPIN_DUE(77, 28, A); _DEFPIN_DUE(78, 23, B);
+
+// digix pins
+_DEFPIN_DUE(90, 0, B); _DEFPIN_DUE(91, 1, B); _DEFPIN_DUE(92, 2, B); _DEFPIN_DUE(93, 3, B);
+_DEFPIN_DUE(94, 4, B); _DEFPIN_DUE(95, 5, B); _DEFPIN_DUE(96, 6, B); _DEFPIN_DUE(97, 7, B);
+_DEFPIN_DUE(98, 8, B); _DEFPIN_DUE(99, 9, B); _DEFPIN_DUE(100, 5, A); _DEFPIN_DUE(101, 22, B);
+_DEFPIN_DUE(102, 23, B); _DEFPIN_DUE(103, 24, B); _DEFPIN_DUE(104, 27, C); _DEFPIN_DUE(105, 20, C);
+_DEFPIN_DUE(106, 11, C); _DEFPIN_DUE(107, 10, C); _DEFPIN_DUE(108, 21, A); _DEFPIN_DUE(109, 30, C);
+_DEFPIN_DUE(110, 29, B); _DEFPIN_DUE(111, 30, B); _DEFPIN_DUE(112, 31, B); _DEFPIN_DUE(113, 28, B);
+
+// Pin numbers used for hardware SPI data and clock (port A bits 26 and 27 in the table above)
+#define SPI_DATA 75
+#define SPI_CLOCK 76
+#define ARM_HARDWARE_SPI
+#define HAS_HARDWARE_PIN_SUPPORT
+
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+
+#endif // __INC_FASTPIN_ARM_SAM_H
@@ -0,0 +1,163 @@
+#ifndef __INC_FASTSPI_ARM_SAM_H
+#define __INC_FASTSPI_ARM_SAM_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(__SAM3X8E__)
+// The SPI peripheral instance used by SAMHardwareSPIOutput
+#define m_SPI ((Spi*)SPI0)
+
+// Hardware SPI output for the SAM3X8E, driving the SPI0 peripheral.  Data is sent in 8 bit transfers; an
+// optional Selectable is selected around each full write cycle and released once the data has gone out.
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class SAMHardwareSPIOutput {
+	Selectable *m_pSelect;
+
+	// spin until the transmit data register can take another value
+	static inline void waitForEmpty() { while ((m_SPI->SPI_SR & SPI_SR_TDRE) == 0); }
+
+	// clear / set the write protect enable bit.
+	// NOTE(review): currently unused; write protection registers on this family usually also expect a key
+	// value in the same write - confirm against the datasheet before relying on these.
+	void enableConfig() { m_SPI->SPI_WPMR &= ~SPI_WPMR_WPEN; }
+	void disableConfig() { m_SPI->SPI_WPMR |= SPI_WPMR_WPEN; }
+
+	void enableSPI() { m_SPI->SPI_CR = SPI_CR_SPIEN; }
+	void disableSPI() { m_SPI->SPI_CR = SPI_CR_SPIDIS; }
+	void resetSPI() { m_SPI->SPI_CR = SPI_CR_SWRST; }
+
+	// program chip select register 0 for `bits` bits per transfer (the BITS field takes bits - 8) together
+	// with the clock divider
+	static inline void readyTransferBits(register uint32_t bits) {
+		bits -= 8;
+		// don't change the number of transfer bits while data is still being transferred from TDR to the shift register
+		waitForEmpty();
+		m_SPI->SPI_CSR[0] = SPI_CSR_NCPHA | SPI_CSR_CSAAT | (bits << SPI_CSR_BITS_Pos) | SPI_CSR_DLYBCT(1) | SPI_CSR_SCBR(_SPI_CLOCK_DIVIDER);
+	}
+
+	// Queue one value for transmission.  BITS is informational only: the transfer size is whatever
+	// readyTransferBits() last programmed (8 after init()), so only that many low bits of w go out.
+	template<int BITS> static inline void writeBits(uint16_t w) {
+		waitForEmpty();
+		m_SPI->SPI_TDR = (uint32_t)w | SPI_PCS(0);
+	}
+
+public:
+	SAMHardwareSPIOutput() { m_pSelect = NULL; }
+	SAMHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// set the object representing the selectable (this used to be a no-op, so a selectable supplied after
+	// construction was silently ignored by select()/release())
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	// initialize the SPI subsystem
+	void init() {
+		// set the output pins master out, master in, clock.  Note doing this here because I still don't
+		// know how I want to expose this type of functionality in FastPin.
+		PIO_Configure(PIOA, PIO_PERIPH_A, FastPin<_DATA_PIN>::mask(), PIO_DEFAULT);
+		PIO_Configure(PIOA, PIO_PERIPH_A, FastPin<_DATA_PIN-1>::mask(), PIO_DEFAULT);
+		PIO_Configure(PIOA, PIO_PERIPH_A, FastPin<_CLOCK_PIN>::mask(), PIO_DEFAULT);
+
+		release();
+
+		// Configure the SPI clock, divider between 1-255
+		// SCBR = _SPI_CLOCK_DIVIDER
+		pmc_enable_periph_clk(ID_SPI0);
+		disableSPI();
+
+		// reset twice (what the sam code does, not sure why?)
+		resetSPI();
+		resetSPI();
+
+		// Configure SPI as master, enable
+		// Bits we want in MR: master, disable mode fault detection, variable peripheral select
+		m_SPI->SPI_MR = SPI_MR_MSTR | SPI_MR_MODFDIS | SPI_MR_PS;
+
+		enableSPI();
+
+		// Send everything out in 8 bit chunks, other sizes appear to work, poorly...
+		readyTransferBits(8);
+	}
+
+	// latch the CS select
+	void inline select() __attribute__((always_inline)) { if(m_pSelect != NULL) { m_pSelect->select(); } }
+
+	// release the CS select
+	void inline release() __attribute__((always_inline)) { if(m_pSelect != NULL) { m_pSelect->release(); } }
+
+	// wait until all queued up data has been written
+	void waitFully() { while((m_SPI->SPI_SR & SPI_SR_TXEMPTY) == 0); }
+
+	// write a byte out via SPI (returns immediately on writing register)
+	static void writeByte(uint8_t b) {
+		writeBits<8>(b);
+	}
+
+	// write a word out via SPI, high byte first (returns immediately on writing register).  The peripheral is
+	// configured for 8 bit transfers, so the word is sent as two bytes; pushing all 16 bits into a single
+	// transfer only sent the low byte.
+	static void writeWord(uint16_t w) {
+		writeByte(w >> 8);
+		writeByte(w & 0xFF);
+	}
+
+	// A raw set of writing byte values, assumes setup/init/waiting done elsewhere
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+
+	// A full cycle of writing a value for len bytes, including select, release, and waiting
+	void writeBytesValue(uint8_t value, int len) {
+		select();
+		writeBytesValueRaw(value, len);
+		// let the queued data finish before the select is released, as writeBytes() does
+		waitFully();
+		release();
+	}
+
+	// A full cycle of writing a raw block of data out, including select, release, and waiting.  Every byte is
+	// passed through D::adjust() and D::postBlock(len) is called once the block has been queued.
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		uint8_t *end = data + len;
+		select();
+		// could be optimized to write 16bit words out instead of 8bit bytes
+		while(data != end) {
+			writeByte(D::adjust(*data++));
+		}
+		D::postBlock(len);
+		waitFully();
+		release();
+	}
+
+	// writeBytes without any per-byte adjustment
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a single bit out, which bit from the passed in byte is determined by template parameter
+	// not the most efficient mechanism in the world - but should be enough for sm16716 and friends
+	template <uint8_t BIT> inline void writeBit(uint8_t b) {
+		// need to wait for all existing data to go out the door, first
+		waitFully();
+		disableSPI();
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+		enableSPI();
+	}
+
+	// Write out every pixel of the controller as three adjusted bytes.  With FLAG_START_BIT set in FLAGS the
+	// first byte of each pixel is queued together with a leading 1 bit.
+	// NOTE(review): that 9 bit value is queued while the transfer size is still 8 bits (see writeBits), so the
+	// start bit presumably never reaches the wire - confirm on hardware before using FLAG_START_BIT here.
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		select();
+		int len = pixels.mLen;
+
+		if(FLAGS & FLAG_START_BIT) {
+			while(pixels.has(1)) {
+				writeBits<9>((1<<8) | D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+				pixels.advanceData();
+				pixels.stepDithering();
+			}
+		} else {
+			while(pixels.has(1)) {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+				pixels.advanceData();
+				pixels.stepDithering();
+			}
+		}
+		D::postBlock(len);
+		// let the queued data finish before the select is released, as writeBytes() does
+		waitFully();
+		release();
+	}
+};
+
+#endif
+
+FASTLED_NAMESPACE_END
+#endif
@@ -0,0 +1,39 @@
+#ifndef __INC_LED_SYSDEFS_ARM_SAM_H
+#define __INC_LED_SYSDEFS_ARM_SAM_H
+
+
+#define FASTLED_ARM
+
+// Setup DUE timer defines/channels/etc...  The group and the channel each default to 0 and can be
+// overridden independently by defining them before this header is included.  (The group default
+// must be guarded by its own name - guarding it with DUE_TIMER_CHANNEL would redefine a caller
+// supplied group, or leave the group undefined when only the channel was supplied.)
+#ifndef DUE_TIMER_GROUP
+#define DUE_TIMER_GROUP 0
+#endif
+
+#ifndef DUE_TIMER_CHANNEL
+#define DUE_TIMER_CHANNEL 0
+#endif
+
+#define DUE_TIMER ((DUE_TIMER_GROUP==0) ? TC0 : ((DUE_TIMER_GROUP==1) ? TC1 : TC2))
+#define DUE_TIMER_ID (ID_TC0 + (DUE_TIMER_GROUP*3) + DUE_TIMER_CHANNEL)
+#define DUE_TIMER_VAL (DUE_TIMER->TC_CHANNEL[DUE_TIMER_CHANNEL].TC_CV << 1)
+#define DUE_TIMER_RUNNING ((DUE_TIMER->TC_CHANNEL[DUE_TIMER_CHANNEL].TC_SR & TC_SR_CLKSTA) != 0)
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 1
+#endif
+
+// Default to allowing interrupts
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 1
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+// reusing/abusing cli/sei defs for due
+#define cli()  __disable_irq(); __disable_fault_irq();
+#define sei() __enable_irq(); __enable_fault_irq();
+
+
+#endif
@@ -0,0 +1,147 @@
+#ifndef __INC_CLOCKLESS_ARM_STM32_H
+#define __INC_CLOCKLESS_ARM_STM32_H
+
+FASTLED_NAMESPACE_BEGIN
+// Definition for a single channel clockless controller for the stm32 family of chips, like that used in the spark core
+// See clockless.h for detailed info on how the template parameters are used.
+
+#define FASTLED_HAS_CLOCKLESS 1
+
+// Single channel clockless controller that bit-bangs DATA_PIN.  Bit timing is paced by polling the
+// cycle counter (_CYCCNT / DWT->CYCCNT) against T1, T2 and T3, which are used here as cycle counts.
+// Each byte is written as 8+XTRA0 bits.  WAIT_TIME parameterizes the CMinWait member that is waited
+// on before, and marked after, every frame.
+// NOTE(review): FLIP is never referenced, and mPinMask/mPort are written in init() but never read
+// in this class.
+template <int DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 50>
+class ClocklessController : public CLEDController {
+  typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+  typedef typename FastPin<DATA_PIN>::port_t data_t;
+
+  data_t mPinMask;
+  data_ptr_t mPort;
+  CMinWait<WAIT_TIME> mWait;
+public:
+  // Configure DATA_PIN as an output and cache its mask and port pointer
+  virtual void init() {
+    FastPin<DATA_PIN>::setOutput();
+    mPinMask = FastPin<DATA_PIN>::mask();
+    mPort = FastPin<DATA_PIN>::port();
+  }
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+  // Clear nLeds leds by showing the color (0,0,0) with a scale of 0
+  virtual void clearLeds(int nLeds) {
+    showColor(CRGB(0, 0, 0), nLeds, 0);
+  }
+
+protected:
+
+  // set all the leds on the controller to a given color
+  virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+
+    mWait.wait();
+    showRGBInternal(pixels);
+    mWait.mark();
+  }
+
+  // write out the nLeds leds in rgbdata, scaled by scale
+  virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+
+    mWait.wait();
+    showRGBInternal(pixels);
+    mWait.mark();
+  }
+
+  #ifdef SUPPORT_ARGB
+  // same as above, for CARGB data
+  virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+    PixelController<RGB_ORDER> pixels(rgbdata, nLeds, scale, getDither());
+    mWait.wait();
+    showRGBInternal(pixels);
+    mWait.mark();
+  }
+  #endif
+
+// Raw access to the register at 0xE0001004 - presumably the same cycle counter that is reached
+// through DWT->CYCCNT in showRGBInternal (TODO confirm against the core headers)
+#define _CYCCNT (*(volatile uint32_t*)(0xE0001004UL))
+
+  // Write BITS bits of b out on the port, most significant bit (0x80) first.  For every bit: spin
+  // until the counter reaches T1+T2+T3-20, write hi to the port, restart the counter at 4, then
+  // write lo once the counter reaches T1+T2-20 (bit set) or T1-10 (bit clear).  The loop handles
+  // the first BITS-1 bits and shifts b left after each one; the last bit is written after the loop
+  // without a shift.  next_mark is accepted but not used here.
+  template<int BITS> __attribute__ ((always_inline)) inline static void writeBits(register uint32_t & next_mark, register data_ptr_t port, register data_t hi, register data_t lo, register uint8_t & b)  {
+    for(register uint32_t i = BITS-1; i > 0; i--) {
+      while(_CYCCNT < (T1+T2+T3-20));
+      FastPin<DATA_PIN>::fastset(port, hi);
+      _CYCCNT = 4;
+      if(b&0x80) {
+        while(_CYCCNT < (T1+T2-20));
+        FastPin<DATA_PIN>::fastset(port, lo);
+      } else {
+        while(_CYCCNT < (T1-10));
+        FastPin<DATA_PIN>::fastset(port, lo);
+      }
+      b <<= 1;
+    }
+
+    // last bit - same sequence as in the loop, minus the shift
+    while(_CYCCNT < (T1+T2+T3-20));
+    FastPin<DATA_PIN>::fastset(port, hi);
+    _CYCCNT = 4;
+
+    if(b&0x80) {
+      while(_CYCCNT < (T1+T2-20));
+      FastPin<DATA_PIN>::fastset(port, lo);
+    } else {
+      while(_CYCCNT < (T1-10));
+      FastPin<DATA_PIN>::fastset(port, lo);
+    }
+  }
+
+  // Write every pixel in the pixel controller out on DATA_PIN and return the value of DWT->CYCCNT
+  // at exit.  Interrupts are disabled (cli) for the write and re-enabled (sei) before returning.
+  // The remark below about register Y is carried over from the AVR version of this method:
+  // This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
+  // gcc will use register Y for the this pointer.
+  static uint32_t showRGBInternal(PixelController<RGB_ORDER> & pixels) {
+    // Get access to the clock
+    CoreDebug->DEMCR  |= CoreDebug_DEMCR_TRCENA_Msk;
+    DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+    DWT->CYCCNT = 0;
+
+    // precompute the full port values with the data pin set / cleared, and start with the line lo
+    register data_ptr_t port = FastPin<DATA_PIN>::port();
+    register data_t hi = *port | FastPin<DATA_PIN>::mask();;
+    register data_t lo = *port & ~FastPin<DATA_PIN>::mask();;
+    *port = lo;
+
+    // Setup the pixel controller and load/scale the first byte
+    pixels.preStepFirstByteDithering();
+    register uint8_t b = pixels.loadAndScale0();
+
+    cli();
+
+    // next_mark is only compared against in the interrupt check below; it is never advanced
+    uint32_t next_mark = (T1+T2+T3);
+
+    DWT->CYCCNT = 0;
+    while(pixels.has(1)) {
+      pixels.stepDithering();
+      #if (FASTLED_ALLOW_INTERRUPTS == 1)
+      cli();
+      // if the counter is more than (WAIT_TIME-INTERRUPT_THRESHOLD) microseconds worth of clocks
+      // past next_mark (i.e. interrupts took too long), punt on the current frame
+      if(DWT->CYCCNT > next_mark) {
+        if((DWT->CYCCNT-next_mark) > ((WAIT_TIME-INTERRUPT_THRESHOLD)*CLKS_PER_US)) { sei(); return DWT->CYCCNT; }
+      }
+
+      // re-read the port, an interrupt handler may have changed other pins on it
+      hi = *port | FastPin<DATA_PIN>::mask();
+      lo = *port & ~FastPin<DATA_PIN>::mask();
+      #endif
+
+      // Write first byte, read next byte
+      writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+      b = pixels.loadAndScale1();
+
+      // Write second byte, read 3rd byte
+      writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+      b = pixels.loadAndScale2();
+
+      // Write third byte, read 1st byte of next pixel
+      writeBits<8+XTRA0>(next_mark, port, hi, lo, b);
+      b = pixels.advanceAndLoadAndScale0();
+      #if (FASTLED_ALLOW_INTERRUPTS == 1)
+      // give pending interrupts a chance to run between pixels
+      sei();
+      #endif
+    };
+
+    sei();
+    return DWT->CYCCNT;
+  }
+};
+
+FASTLED_NAMESPACE_END
+
+  #endif
@@ -0,0 +1,10 @@
+#ifndef __INC_FASTLED_ARM_SAM_H
+#define __INC_FASTLED_ARM_SAM_H
+
+// Include the stm32 headers
+#include "fastled_delay.h"
+#include "fastpin_arm_stm32.h"
+// #include "fastspi_arm_stm32.h"
+#include "clockless_arm_stm32.h"
+
+#endif
@@ -0,0 +1,105 @@
+#ifndef __FASTPIN_ARM_STM32_H
+#define __FASTPIN_ARM_STM32_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+#warning "Software pin support forced, pin access will be sloightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+/// Template definition for STM32 style ARM pins, providing direct access to the various GPIO registers.  Note that this
+/// uses the full port GPIO registers.  In theory, in some way, bit-band register access -should- be faster, however I have found
+/// that something about the way gcc does register allocation results in the bit-band code being slower.  It will need more fine tuning.
+/// The registers used below are ODR (read/written as the full port output value), BSRR (written by hi()) and BRR (written by lo()),
+/// all reached through the GPIO_TypeDef pointer returned by _GPIO::r().  PIN is the pin number handed to pinMode(), _BIT the bit
+/// position within the port and _MASK the matching bit mask.
+
+template<uint8_t PIN, uint8_t _BIT, uint32_t _MASK, typename _GPIO> class _ARMPIN {
+public:
+  typedef volatile uint32_t * port_ptr_t;
+  typedef uint32_t port_t;
+
+  // Disabled direct register version of setOutput/setInput, kept for reference only (never compiled)
+  #if 0
+  inline static void setOutput() {
+    if(_BIT<8) {
+      _CRL::r() = (_CRL::r() & (0xF << (_BIT*4)) | (0x1 << (_BIT*4));
+    } else {
+      _CRH::r() = (_CRH::r() & (0xF << ((_BIT-8)*4))) | (0x1 << ((_BIT-8)*4));
+    }
+  }
+  inline static void setInput() { /* TODO */ } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+  #endif
+
+  // Pin direction is delegated to pinMode()
+  inline static void setOutput() { pinMode(PIN, OUTPUT); } // TODO: perform MUX config { _PDDR::r() |= _MASK; }
+  inline static void setInput() { pinMode(PIN, INPUT); } // TODO: perform MUX config { _PDDR::r() &= ~_MASK; }
+
+  // hi() writes the pin mask to BSRR, lo() writes it to BRR, set() writes a full port value to ODR
+  inline static void hi() __attribute__ ((always_inline)) { _GPIO::r()->BSRR = _MASK; }
+  inline static void lo() __attribute__ ((always_inline)) { _GPIO::r()->BRR = _MASK; }
+  // inline static void lo() __attribute__ ((always_inline)) { _GPIO::r()->BSRR = (_MASK<<16); }
+  inline static void set(register port_t val) __attribute__ ((always_inline)) { _GPIO::r()->ODR = val; }
+
+  // toggle twice, ending in the state we started in
+  inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+  // read the current output state from ODR and drive the pin to the opposite state
+  inline static void toggle() __attribute__ ((always_inline)) { if(_GPIO::r()->ODR & _MASK) { lo(); } else { hi(); } }
+
+  // the port argument is ignored by hi(port)/lo(port); fastset writes val straight through the given pointer
+  inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+  inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+  inline static void fastset(register port_ptr_t port, register port_t val) __attribute__ ((always_inline)) { *port = val; }
+
+  // hival/loval: current ODR value with this pin's bit set/cleared; port/sport/cport: addresses of ODR/BSRR/BRR
+  inline static port_t hival() __attribute__ ((always_inline)) { return _GPIO::r()->ODR | _MASK; }
+  inline static port_t loval() __attribute__ ((always_inline)) { return _GPIO::r()->ODR & ~_MASK; }
+  inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_GPIO::r()->ODR; }
+  inline static port_ptr_t sport() __attribute__ ((always_inline)) { return &_GPIO::r()->BSRR; }
+  inline static port_ptr_t cport() __attribute__ ((always_inline)) { return &_GPIO::r()->BRR; }
+  inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+// _R(T) names the accessor struct generated for T; _RD32(T) defines that struct, with a static r()
+// that returns T as a volatile GPIO_TypeDef pointer
+#define _R(T) struct __gen_struct_ ## T
+#define _RD32(T) struct __gen_struct_ ## T { static __attribute__((always_inline)) inline volatile GPIO_TypeDef * r() { return T; } };
+
+// Define the accessor struct for the GPIO port with letter L (_IO32(A) wraps GPIOA)
+#define _IO32(L) _RD32(GPIO ## L)
+
+// Specialize FastPin<PIN> as bit BIT (mask 1 << BIT) of the GPIO port with letter L
+#define _DEFPIN_ARM(PIN, BIT, L) template<> class FastPin<PIN> : public _ARMPIN<PIN, BIT, 1 << BIT, _R(GPIO ## L)> {};
+
+// Actual pin definitions
+#if defined(SPARK)
+
+_IO32(A); _IO32(B); _IO32(C); _IO32(D); _IO32(E); _IO32(F); _IO32(G);
+
+
+#define MAX_PIN 19
+_DEFPIN_ARM(0, 7, B);
+_DEFPIN_ARM(1, 6, B);
+_DEFPIN_ARM(2, 5, B);
+_DEFPIN_ARM(3, 4, B);
+_DEFPIN_ARM(4, 3, B);
+_DEFPIN_ARM(5, 15, A);
+_DEFPIN_ARM(6, 14, A);
+_DEFPIN_ARM(7, 13, A);
+_DEFPIN_ARM(8, 8, A);
+_DEFPIN_ARM(9, 9, A);
+_DEFPIN_ARM(10, 0, A);
+_DEFPIN_ARM(11, 1, A);
+_DEFPIN_ARM(12, 4, A);
+_DEFPIN_ARM(13, 5, A);
+_DEFPIN_ARM(14, 6, A);
+_DEFPIN_ARM(15, 7, A);
+_DEFPIN_ARM(16, 0, B);
+_DEFPIN_ARM(17, 1, B);
+_DEFPIN_ARM(18, 3, A);
+_DEFPIN_ARM(19, 2, A);
+
+
+#define SPI_DATA 15
+#define SPI_CLOCK 13
+
+#define HAS_HARDWARE_PIN_SUPPORT
+
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+#endif // __INC_FASTPIN_ARM_STM32
@@ -0,0 +1,47 @@
+#ifndef __INC_LED_SYSDEFS_ARM_SAM_H
+#define __INC_LED_SYSDEFS_ARM_SAM_H
+
+#include "application.h"
+
+#define FASTLED_NAMESPACE_BEGIN namespace NSFastLED {
+#define FASTLED_NAMESPACE_END }
+#define FASTLED_USING_NAMESPACE using namespace NSFastLED;
+
+#define FASTLED_ARM
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 1
+#endif
+
+// Default to NOT allowing interrupts
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 0
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+// reusing/abusing cli/sei defs for due
+#define cli()  __disable_irq(); __disable_fault_irq();
+#define sei() __enable_irq(); __enable_fault_irq();
+
+// pgmspace definitions
+#define PROGMEM
+#define pgm_read_dword(addr) (*(const unsigned long *)(addr))
+#define pgm_read_dword_near(addr) pgm_read_dword(addr)
+
+// Default to NOT using PROGMEM here
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 0
+#endif
+
+// data type defs
+typedef volatile       uint8_t RoReg; /**< Read only 8-bit register (volatile const unsigned int) */
+typedef volatile       uint8_t RwReg; /**< Read-Write 8-bit register (volatile unsigned int) */
+
+#define FASTLED_NO_PINMAP
+
+#define F_CPU 72000000
+
+#endif
@@ -0,0 +1,514 @@
+#ifndef __INC_CLOCKLESS_TRINKET_H
+#define __INC_CLOCKLESS_TRINKET_H
+
+#include "controller.h"
+#include "lib8tion.h"
+#include <avr/interrupt.h> // for cli/sei definitions
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_AVR)
+
+// Scaling macro choice
+#ifndef TRINKET_SCALE
+#define TRINKET_SCALE 1
+// whether or not to use dithering
+#define DITHER 1
+#endif
+
+#if (F_CPU==8000000)
+#define FASTLED_SLOW_CLOCK_ADJUST asm __volatile__ ("mov r0,r0\n\t");
+#else
+#define FASTLED_SLOW_CLOCK_ADJUST
+#endif
+
+#define US_PER_TICK (64 / (F_CPU/1000000))
+
+// Variations on the functions in delay.h - w/a loop var passed in to preserve registers across calls by the optimizer/compiler
+template<int CYCLES> inline void _dc(register uint8_t & loopvar);
+
+// Delay helper: first runs _dc<PAD>, then a LDI/DEC/BRNE countdown that starts from _LOOP, using
+// loopvar as the counter register.  The asm contains two copies of the countdown; BRCS picks the
+// second copy when carry is set on entry, and that copy finishes with BSET 0.
+template<int _LOOP, int PAD> __attribute__((always_inline)) inline void _dc_AVR(register uint8_t & loopvar) {
+	_dc<PAD>(loopvar);
+	// The convolution in here is to ensure that the state of the carry flag coming into the delay loop is preserved
+	asm __volatile__ (  "BRCS L_PC%=\n\t"
+						"        LDI %[loopvar], %[_LOOP]\n\tL_%=: DEC %[loopvar]\n\t BRNE L_%=\n\tBREQ L_DONE%=\n\t"
+						"L_PC%=: LDI %[loopvar], %[_LOOP]\n\tLL_%=: DEC %[loopvar]\n\t BRNE LL_%=\n\tBSET 0\n\t"
+						"L_DONE%=:\n\t"
+						:
+							[loopvar] "+a" (loopvar) : [_LOOP] "M" (_LOOP) : );
+}
+
+// Generic delay: split CYCLES into CYCLES/6 countdown iterations plus a CYCLES%6 remainder and
+// hand both to _dc_AVR.  Only used for values without an explicit specialization below.
+template<int CYCLES> __attribute__((always_inline)) inline void _dc(register uint8_t & loopvar) {
+	_dc_AVR<CYCLES/6,CYCLES%6>(loopvar);
+}
+// Zero and negative delays (-6..0) are empty
+template<> __attribute__((always_inline)) inline void _dc<-6>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc<-5>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc<-4>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc<-3>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc<-2>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc<-1>(register uint8_t & loopvar) {}
+template<> __attribute__((always_inline)) inline void _dc< 0>(register uint8_t & loopvar) {}
+// The two primitive delays: a single "mov r0,r0" for 1 and a single "rjmp .+0" for 2
+template<> __attribute__((always_inline)) inline void _dc< 1>(register uint8_t & loopvar) {asm __volatile__("mov r0,r0":::);}
+template<> __attribute__((always_inline)) inline void _dc< 2>(register uint8_t & loopvar) {asm __volatile__("rjmp .+0":::);}
+// 3..20 are composed from smaller delays that add up to the requested value
+template<> __attribute__((always_inline)) inline void _dc< 3>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<1>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 4>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 5>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 6>(register uint8_t & loopvar) { _dc<2>(loopvar); _dc<2>(loopvar); _dc<2>(loopvar);}
+template<> __attribute__((always_inline)) inline void _dc< 7>(register uint8_t & loopvar) { _dc<4>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 8>(register uint8_t & loopvar) { _dc<4>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc< 9>(register uint8_t & loopvar) { _dc<5>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<10>(register uint8_t & loopvar) { _dc<6>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<11>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<1>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<12>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<2>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<13>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<3>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<14>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<4>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<15>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<5>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<16>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<6>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<17>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<7>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<18>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<8>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<19>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<9>(loopvar); }
+template<> __attribute__((always_inline)) inline void _dc<20>(register uint8_t & loopvar) { _dc<10>(loopvar); _dc<10>(loopvar); }
+
+// DINTPIN: delay T-(PINADJ+ADJ) cycles when that value is positive, otherwise delay 0.  Both arms
+// of the ?: name a _dc<> instantiation, so _dc<> has to exist for the non-positive values too.
+#define DINTPIN(T,ADJ,PINADJ) (T-(PINADJ+ADJ)>0) ? _dc<T-(PINADJ+ADJ)>(loopvar) : _dc<0>(loopvar);
+// DINT: pick PINADJ (1 or 2) from AVR_PIN_CYCLES(DATA_PIN).  Expects DATA_PIN and loopvar to be in scope.
+#define DINT(T,ADJ) if(AVR_PIN_CYCLES(DATA_PIN)==1) { DINTPIN(T,ADJ,1) } else { DINTPIN(T,ADJ,2); }
+// D1/D2/D3: the delays for the T1/T2/T3 timing values, reduced by ADJ cycles
+#define D1(ADJ) DINT(T1,ADJ)
+#define D2(ADJ) DINT(T2,ADJ)
+#define D3(ADJ) DINT(T3,ADJ)
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Base template for clockless controllers.  These controllers have 3 control points in their cycle for each bit.  The first point
+// is where the line is raised hi.  The second point is where the line is dropped low for a zero.  The third point is where the
+// line is dropped low for a one.  T1, T2, and T3 correspond to the timings for those three in clock cycles.
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0)
+static uint8_t gTimeErrorAccum256ths;
+#endif
+
+#define FASTLED_HAS_CLOCKLESS 1
+
+template <uint8_t DATA_PIN, int T1, int T2, int T3, EOrder RGB_ORDER = RGB, int XTRA0 = 0, bool FLIP = false, int WAIT_TIME = 10>
+class ClocklessController : public CLEDController {
+	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
+	typedef typename FastPin<DATA_PIN>::port_t data_t;
+
+	CMinWait<WAIT_TIME> mWait;
+public:
+	// Configure DATA_PIN as an output
+	virtual void init() {
+		FastPin<DATA_PIN>::setOutput();
+	}
+
+	virtual uint16_t getMaxRefreshRate() const { return 400; }
+
+	// Clear nLeds leds: a single all-zero CRGB is used both as the (non-advancing) pixel data and as the scale
+	virtual void clearLeds(int nLeds) {
+		CRGB zeros(0,0,0);
+		showAdjTime((uint8_t*)&zeros, nLeds, zeros, false, 0);
+	}
+
+protected:
+
+	// set all the leds on the controller to a given color (advance is false, so the same CRGB is reused for every led)
+	virtual void showColor(const struct CRGB & rgbdata, int nLeds, CRGB scale) {
+		showAdjTime((uint8_t*)&rgbdata, nLeds, scale, false, 0);
+	}
+
+	// write out nLeds leds from the rgbdata array (advance is true)
+	virtual void show(const struct CRGB *rgbdata, int nLeds, CRGB scale) {
+		showAdjTime((uint8_t*)rgbdata, nLeds, scale, true, 0);
+	}
+
+#ifdef SUPPORT_ARGB
+	// same as above for CARGB data, passing a skip value of 1
+	virtual void show(const struct CARGB *rgbdata, int nLeds, CRGB scale) {
+		showAdjTime((uint8_t*)rgbdata, nLeds, scale, true, 1);
+	}
+#endif
+
+	// Write nLeds leds out with interrupts disabled and then, when clock correction is compiled in,
+	// credit MS_COUNTER with an estimate of the milliseconds that were spent with interrupts off.
+	//   data    - raw pixel bytes handed to the PixelController
+	//   nLeds   - number of leds to write
+	//   scale   - per channel scale values
+	//   advance - passed through to the PixelController (false when one color is repeated for every led)
+	//   skip    - passed through to the PixelController
+	void showAdjTime(const uint8_t *data, int nLeds, CRGB & scale, bool advance, int skip) {
+		PixelController<RGB_ORDER> pixels(data, nLeds, scale, getDither(), advance, skip);
+
+		mWait.wait();
+		cli();
+
+		showRGBInternal(pixels);
+
+		// Adjust the timer
+#if (!defined(NO_CORRECTION) || (NO_CORRECTION == 0)) && (FASTLED_ALLOW_INTERRUPTS == 0)
+        // nominal time: 24 bits per led, T1+T2+T3 clocks per bit
+        uint32_t microsTaken = (uint32_t)nLeds * (uint32_t)CLKS_TO_MICROS(24 * (T1 + T2 + T3));
+
+        // adjust for approximate observed actual runtime (as of January 2015)
+        // roughly 9.6 cycles per pixel, which is 0.6us/pixel at 16MHz
+        // microsTaken += nLeds * 0.6 * CLKS_TO_MICROS(16);
+        microsTaken += scale16by8(nLeds,(0.6 * 256) + 1) * CLKS_TO_MICROS(16);
+
+        // if less than 1000us, there is NO timer impact,
+        // this is because the ONE interrupt that might come in while interrupts
+        // are disabled is queued up, and it will be serviced as soon as
+        // interrupts are re-enabled.
+        // This actually should technically also account for the runtime of the
+        // interrupt handler itself, but we're just not going to worry about that.
+        if( microsTaken > 1000) {
+
+            // Since up to one timer tick will be queued, we don't need
+            // to adjust the MS_COUNTER for that one.
+            microsTaken -= 1000;
+
+            // Now convert microseconds to 256ths of a millisecond, approximately like this:
+            // 250ths = (us/4)
+            // 256ths = 250ths * (263/256);
+            uint16_t x256ths = microsTaken >> 2;
+            x256ths += scale16by8(x256ths,7);
+
+            // add in the fraction left over from the previous call, credit the whole milliseconds
+            // to MS_COUNTER and carry the new fraction forward
+            x256ths += gTimeErrorAccum256ths;
+            MS_COUNTER += (x256ths >> 8);
+            gTimeErrorAccum256ths = x256ths & 0xFF;
+        }
+
+#if 0
+        // For pixel counts of 30 and under at 16Mhz, no correction is necessary.
+        // For pixel counts of 15 and under at 8Mhz, no correction is necessary.
+        //
+        // This code, below, is smaller, and quicker clock correction, which drifts much
+        // more significantly, but is a few bytes smaller.  Presented here for consideration
+        // as an alternate on the ATtiny, which can't have more than about 150 pixels MAX
+        // anyway, meaning that microsTaken will never be more than about 4,500, which fits in
+        // a 16-bit variable.  The difference between /1000 and /1024 only starts showing
+        // up in the range of about 100 pixels, so many ATtiny projects won't even
+        // see a clock difference due to the approximation there.
+		uint16_t microsTaken = (uint32_t)nLeds * (uint32_t)CLKS_TO_MICROS((24) * (T1 + T2 + T3));
+        MS_COUNTER += (microsTaken >> 10);
+#endif
+
+#endif
+
+		sei();
+		mWait.mark();
+	}
+#define USE_ASM_MACROS
+
+// The variables that our various asm statements use.  The same block of variables needs to be declared for
+// all the asm blocks because GCC is pretty stupid and it would clobber variables happily or optimize code away too aggressively
+#define ASM_VARS : /* write variables */				\
+				[count] "+x" (count),					\
+				[data] "+z" (data),						\
+				[b1] "+a" (b1),							\
+				[d0] "+r" (d0),							\
+				[d1] "+r" (d1),							\
+				[d2] "+r" (d2),							\
+				[loopvar] "+a" (loopvar),				\
+				[scale_base] "+a" (scale_base)			\
+				: /* use variables */					\
+				[ADV] "r" (advanceBy),					\
+				[b0] "a" (b0),							\
+				[hi] "r" (hi),							\
+				[lo] "r" (lo),							\
+				[s0] "r" (s0),					  		\
+				[s1] "r" (s1),							\
+				[s2] "r" (s2),							\
+				[e0] "r" (e0),							\
+				[e1] "r" (e1),							\
+				[e2] "r" (e2),							\
+				[PORT] "M" (FastPin<DATA_PIN>::port()-0x20),		\
+				[O0] "M" (RGB_BYTE0(RGB_ORDER)),		\
+				[O1] "M" (RGB_BYTE1(RGB_ORDER)),		\
+				[O2] "M" (RGB_BYTE2(RGB_ORDER))		\
+				: "cc" /* clobber registers */
+
+
+// Note: the code in the else in HI1/LO1 will be turned into an sts (2 cycle, 2 word) opcode
+// 1 cycle, write hi to the port
+#define HI1 FASTLED_SLOW_CLOCK_ADJUST if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[hi]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=hi; }
+// 1 cycle, write lo to the port
+#define LO1 if((int)(FastPin<DATA_PIN>::port())-0x20 < 64) { asm __volatile__("out %[PORT], %[lo]" ASM_VARS ); } else { *FastPin<DATA_PIN>::port()=lo; }
+
+// 2 cycles, sbrs on flipping the line to lo if we're pushing out a 0
+#define QLO2(B, N) asm __volatile__("sbrs %[" #B "], " #N ASM_VARS ); LO1;
+// load a byte from ram into the given var with the given offset
+#define LD2(B,O) asm __volatile__("ldd %[" #B "], Z + %[" #O "]\n\t" ASM_VARS ); 
+// 4 cycles - load a byte from ram into the scaling scratch space with the given offset, clear the target var, clear carry
+#define LDSCL4(B,O) asm __volatile__("ldd %[scale_base], Z + %[" #O "]\n\tclr %[" #B "]\n\tclc\n\t" ASM_VARS ); 
+
+#if (DITHER==1) 
+// apply dithering value  before we do anything with scale_base
+#define PRESCALE4(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\tbrcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\t" ASM_VARS);
+
+// Do the add for the prescale
+#define PRESCALEA2(D) asm __volatile__("cpse %[scale_base], __zero_reg__\n\t add %[scale_base],%[" #D "]\n\t" ASM_VARS);
+
+// Do the clamp for the prescale, clear carry when we're done - NOTE: Must ensure carry flag state is preserved!
+#define PRESCALEB3(D) asm __volatile__("brcc L_%=\n\tldi %[scale_base], 0xFF\n\tL_%=:\n\tCLC" ASM_VARS);
+
+#else
+#define PRESCALE4(D) _dc<4>(loopvar);
+#define PRESCALEA2(D) _dc<2>(loopvar);
+#define PRESCALEB3(D) _dc<3>(loopvar);
+#endif
+
+// 2 cycles - perform one step of the scaling (if a given bit is set in scale, add scale-base to the scratch space)
+#define _SCALE02(B, N) "sbrc %[s0], " #N "\n\tadd %[" #B "], %[scale_base]\n\t"
+#define _SCALE12(B, N) "sbrc %[s1], " #N "\n\tadd %[" #B "], %[scale_base]\n\t" 
+#define _SCALE22(B, N) "sbrc %[s2], " #N "\n\tadd %[" #B "], %[scale_base]\n\t" 
+#define SCALE02(B,N) asm __volatile__( _SCALE02(B,N) ASM_VARS );
+#define SCALE12(B,N) asm __volatile__( _SCALE12(B,N) ASM_VARS );
+#define SCALE22(B,N) asm __volatile__( _SCALE22(B,N) ASM_VARS );
+
+// 1 cycle - rotate right, pulling in from carry
+#define _ROR1(B) "ror %[" #B "]\n\t" 
+#define ROR1(B) asm __volatile__( _ROR1(B) ASM_VARS);
+
+// 1 cycle, clear the carry bit
+#define _CLC1 "clc\n\t" 
+#define CLC1 asm __volatile__( _CLC1 ASM_VARS );
+
+// 2 cycles, rotate right, pulling in from carry then clear the carry bit
+#define RORCLC2(B) asm __volatile__( _ROR1(B) _CLC1 ASM_VARS );
+
+// 4 cycles, rotate, clear carry, scale next bit
+#define RORSC04(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE02(B, N) ASM_VARS );
+#define RORSC14(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE12(B, N) ASM_VARS );
+#define RORSC24(B, N) asm __volatile__( _ROR1(B) _CLC1 _SCALE22(B, N) ASM_VARS );
+
+// 4 cycles, scale bit, rotate, clear carry
+#define SCROR04(B, N) asm __volatile__( _SCALE02(B,N) _ROR1(B) _CLC1 ASM_VARS ); 
+#define SCROR14(B, N) asm __volatile__( _SCALE12(B,N) _ROR1(B) _CLC1 ASM_VARS );
+#define SCROR24(B, N) asm __volatile__( _SCALE22(B,N) _ROR1(B) _CLC1 ASM_VARS );
+
+/////////////////////////////////////////////////////////////////////////////////////
+// Loop life cycle
+
+// dither adjustment macro - should be kept in sync w/what's in stepDithering
+// #define ADJDITHER2(D, E) D = E - D;
+#define ADJDITHER2(D, E) asm __volatile__ ("neg %[" #D "]\n\tadd %[" #D "],%[" #E "]\n\t" ASM_VARS); 
+
+// #define xstr(a) str(a)
+// #define str(a) #a
+// #define ADJDITHER2(D,E) asm __volatile__("subi %[" #D "], " xstr(DUSE) "\n\tand %[" #D "], %[" #E "]\n\t" ASM_VARS);
+
+// define the beginning of the loop
+#define LOOP asm __volatile__("1:" ASM_VARS );
+// define the end of the loop
+#define DONE asm __volatile__("2:" ASM_VARS );
+
+// 2 cycles - increment the data pointer
+#define IDATA2 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t"  ASM_VARS );
+#define IDATACLC3 asm __volatile__("add %A[data], %[ADV]\n\tadc %B[data], __zero_reg__\n\t" _CLC1  ASM_VARS );
+
+// 1 cycle mov
+#define MOV1(B1, B2) asm __volatile__("mov %[" #B1 "], %[" #B2 "]" ASM_VARS );
+
+// 2 cycles - decrement the counter
+#define DCOUNT2 asm __volatile__("sbiw %[count], 1" ASM_VARS );
+// 2 cycles - jump to the beginning of the loop
+#define JMPLOOP2 asm __volatile__("rjmp 1b" ASM_VARS );
+// 2 cycles - jump out of the loop (rjmp to the next "2:" label) when the zero flag is set,
+// otherwise skip over the rjmp and carry on.  The brne has to name the local label as "3f";
+// a bare "3" is an absolute value, not a reference to the "3:" label that follows.
+#define BRLOOP1 asm __volatile__("brne 3f\n\trjmp 2f\n\t3:" ASM_VARS );
+
+// 5 cycles 2 sbiw, 3 for the breq/rjmp
+#define ENDLOOP5 asm __volatile__("sbiw %[count], 1\n\tbreq L_%=\n\trjmp 1b\n\tL_%=:\n\t" ASM_VARS);
+
+// NOP using the variables, forcing a move
+#define DNOP asm __volatile__("mov r0,r0" ASM_VARS);
+
+#define DADVANCE 3
+#define DUSE (0xFF - (DADVANCE-1))
+
+	// Write every pixel in the pixel controller out on DATA_PIN using the asm macros defined above.
+	// The scale, dither (d) and e values are copied out of the pixel controller into locals so the
+	// asm can use them directly; while one byte is being clocked out, the next byte is loaded,
+	// dithered and scaled in between the port writes.  The caller (showAdjTime) disables interrupts
+	// around this call.
+	//
+	// This method is made static to force making register Y available to use for data on AVR - if the method is non-static, then
+	// gcc will use register Y for the this pointer.
+	static void /*__attribute__((optimize("O0")))*/  /*__attribute__ ((always_inline))*/  showRGBInternal(PixelController<RGB_ORDER> & pixels)  {
+		uint8_t *data = (uint8_t*)pixels.mData;
+		data_ptr_t port = FastPin<DATA_PIN>::port();
+		data_t mask = FastPin<DATA_PIN>::mask();
+		uint8_t scale_base = 0;
+
+		// register uint8_t *end = data + nLeds;
+		// full port values with the data pin set / cleared; start with the line lo
+		data_t hi = *port | mask;
+		data_t lo = *port & ~mask;
+		*port = lo;
+
+		// the byte currently being written out
+		uint8_t b0 = 0;
+		// the byte currently being worked on to write the next out
+		uint8_t b1 = 0;
+
+		// Setup the pixel controller
+		pixels.preStepFirstByteDithering();
+
+		// pull the dithering/adjustment values out of the pixels object for direct asm access
+		uint8_t advanceBy = pixels.advanceBy();
+		uint16_t count = pixels.mLen;
+
+		uint8_t s0 = pixels.mScale.raw[RO(0)];
+		uint8_t s1 = pixels.mScale.raw[RO(1)];
+		uint8_t s2 = pixels.mScale.raw[RO(2)];
+		uint8_t d0 = pixels.d[RO(0)];
+		uint8_t d1 = pixels.d[RO(1)];
+		uint8_t d2 = pixels.d[RO(2)];
+		uint8_t e0 = pixels.e[RO(0)];
+		uint8_t e1 = pixels.e[RO(1)];
+		uint8_t e2 = pixels.e[RO(2)];
+
+		// scratch counter for the _dc delay loops
+		uint8_t loopvar=0;
+
+		// load/scale the first byte
+#if !defined(LIB8_ATTINY)
+		// we have a hardware multiply, can use loadAndScale0
+		b0 = pixels.loadAndScale0();
+#else
+		// no hardware multiply, we have to do our own mul by hand here, lest we incur a
+		// function call which will kill all of our register usage/allocations below
+		b0 = data[RO(0)];
+		{
+			LDSCL4(b0,O0) 	PRESCALEA2(d0)
+			PRESCALEB3(d0)	SCALE02(b0,0)
+			RORSC04(b0,1) 	ROR1(b0) CLC1
+			SCROR04(b0,2)		SCALE02(b0,3)
+			RORSC04(b0,4) 	ROR1(b0) CLC1
+			SCROR04(b0,5) 	SCALE02(b0,6)
+			RORSC04(b0,7) 	ROR1(b0) CLC1
+		}
+#endif
+
+		// #if (FASTLED_ALLOW_INTERRUPTS == 1)
+		// TCCR0A |= 0x30;
+		// OCR0B = (uint8_t)(TCNT0 + ((WAIT_TIME-INTERRUPT_THRESHOLD)/US_PER_TICK));
+		// TIFR0 = 0x04;
+		// #endif
+		{
+			// while(--count)
+			{
+				// Loop beginning, does some stuff that's outside of the pixel write cycle, namely incrementing d0-2 and masking off
+				// by the E values (see the definition )
+				DNOP;
+				LOOP;
+
+				// ADJDITHER2(d0,e0);
+				// ADJDITHER2(d1,e1);
+				// ADJDITHER2(d2,e2);
+				// NOP;
+				// #if (FASTLED_ALLOW_INTERRUPTS == 1)
+				// cli();
+				// if(TIFR0 & 0x04) {
+				// 	sei();
+				// 	TCCR0A &= ~0x30;
+				// 	return;
+				// }
+				// hi = *port | mask;
+				// lo = *port & ~mask;
+				// #endif
+
+				// Sum of the clock counts across each row should be 10 for 8Mhz, WS2811
+				// The values in the D1/D2/D3 indicate how many cycles the previous column takes
+				// to allow things to line back up.
+				//
+				// While writing out byte 0, we're loading up byte 1, applying the dithering adjustment,
+				// then scaling it using 8 cycles of shift/add interleaved in between writing the bits
+				// out.  When doing byte 1, we're doing the above for byte 2.  When we're doing byte 2,
+				// we're cycling back around and doing the above for byte 0.
+#if TRINKET_SCALE
+				// Inline scaling - RGB ordering
+				// DNOP
+				HI1 D1(1) QLO2(b0, 7) LDSCL4(b1,O1) 	D2(4)	LO1	PRESCALEA2(d1)	D3(2) 
+				HI1	D1(1) QLO2(b0, 6) PRESCALEB3(d1)	D2(3)	LO1	SCALE12(b1,0)	D3(2)
+				HI1 D1(1) QLO2(b0, 5) RORSC14(b1,1) 	D2(4)	LO1 RORCLC2(b1)		D3(2)
+				HI1 D1(1) QLO2(b0, 4) SCROR14(b1,2)		D2(4)	LO1 SCALE12(b1,3)	D3(2)
+				HI1 D1(1) QLO2(b0, 3) RORSC14(b1,4) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 2) SCROR14(b1,5) 	D2(4)	LO1 SCALE12(b1,6)	D3(2)
+				HI1 D1(1) QLO2(b0, 1) RORSC14(b1,7) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 0) 
+				// the missing breaks are deliberate: case N falls through, repeating the bit 0 write once per extra bit
+				switch(XTRA0) {
+					case 4: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 3: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 2: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 1: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+				} 
+				ADJDITHER2(d1,e1) D2(2) LO1 MOV1(b0,b1) D3(1)
+
+				HI1 D1(1) QLO2(b0, 7) LDSCL4(b1,O2) 	D2(4)	LO1	PRESCALEA2(d2)	D3(2)
+				HI1	D1(1) QLO2(b0, 6) PRESCALEB3(d2)	D2(3)	LO1	SCALE22(b1,0)	D3(2)
+				HI1 D1(1) QLO2(b0, 5) RORSC24(b1,1) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 4) SCROR24(b1,2)		D2(4)	LO1 SCALE22(b1,3)	D3(2)
+				HI1 D1(1) QLO2(b0, 3) RORSC24(b1,4) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 2) SCROR24(b1,5) 	D2(4)	LO1 SCALE22(b1,6)	D3(2)
+				HI1 D1(1) QLO2(b0, 1) RORSC24(b1,7) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 0) 			
+				switch(XTRA0) {
+					case 4: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 3: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 2: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 1: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+				} 
+				IDATACLC3 MOV1(b0,b1) D2(4) LO1 ADJDITHER2(d2,e2) D3(2)
+
+				HI1 D1(1) QLO2(b0, 7) LDSCL4(b1,O0) 	D2(4)	LO1	PRESCALEA2(d0)	D3(2)
+				HI1	D1(1) QLO2(b0, 6) PRESCALEB3(d0)	D2(3)	LO1	SCALE02(b1,0)	D3(2)
+				HI1 D1(1) QLO2(b0, 5) RORSC04(b1,1) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 4) SCROR04(b1,2)		D2(4)	LO1 SCALE02(b1,3)	D3(2)
+				HI1 D1(1) QLO2(b0, 3) RORSC04(b1,4) 	D2(4)	LO1 RORCLC2(b1)  	D3(2)
+				HI1 D1(1) QLO2(b0, 2) SCROR04(b1,5) 	D2(4)	LO1 SCALE02(b1,6)	D3(2)
+				HI1 D1(1) QLO2(b0, 1) RORSC04(b1,7) 	D2(4)	LO1 RORCLC2(b1) 	D3(2)
+				HI1 D1(1) QLO2(b0, 0) 	 
+				switch(XTRA0) {
+					case 4: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 3: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 2: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+					case 1: D2(0) LO1 D3(0) HI1 D1(1) QLO2(b0,0)
+				} 
+				ADJDITHER2(d0,e0) MOV1(b0,b1) D2(3) LO1 D3(6)
+				// decrement count and jump back to the LOOP label until it reaches zero
+				ENDLOOP5
+#else
+				// no inline scaling - non-straight RGB ordering -- no longer in line with the actual asm macros above, left for
+				// reference only
+				HI1	D1(1) QLO2(b0, 7) LD2(b1,O1)	D2(2)	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 6) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 5) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 4) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 3) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 2) 				D2(0)	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 1) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b0, 0) 				D2(0) 	LO1 D3(0)
+				HI1	D1(1) QLO2(b1, 7) LD2(b1,O2) 	D2(2)	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 6) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 5) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 4) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 3) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 2) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 1) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 0) IDATA2 		D2(2)	LO1 D3(0)
+				HI1	D1(1) QLO2(b1, 7) LD2(b0,O0) 	D2(2)	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 6) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 5) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 4) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 3) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 2) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 1) 				D2(0) 	LO1 D3(0)
+				HI1 D1(1) QLO2(b1, 0) 				D2(0) 	LO1 D3(0)
+#endif
+
+				// #if (FASTLED_ALLOW_INTERRUPTS == 1)
+				// // set the counter mark
+				// OCR0B = (uint8_t)(TCNT0 + ((WAIT_TIME-INTERRUPT_THRESHOLD)/US_PER_TICK));
+				// TIFR0 = 0x04;
+				// sei();
+				// #endif
+			}
+			DONE;
+		}
+
+		// NOTE(review): the matching setup (TCCR0A |= 0x30) only exists in the commented out code
+		// above, so this clear is the only live part of the interrupt handling - confirm it is still wanted
+		#if (FASTLED_ALLOW_INTERRUPTS == 1)
+		// stop using the clock juggler
+		TCCR0A &= ~0x30;
+		#endif
+	}
+
+#ifdef SUPPORT_ARGB
+	virtual void showARGB(struct CARGB *data, int nLeds) {
+		// TODO: IMPLEMENTME
+	}
+#endif
+};
+
+#endif
+
+FASTLED_NAMESPACE_END
+
+#endif
@@ -0,0 +1,14 @@
+#ifndef __INC_FASTLED_AVR_H
+#define __INC_FASTLED_AVR_H
+
+#include "fastled_delay.h"
+#include "fastpin_avr.h"
+#include "fastspi_avr.h"
+#include "clockless_trinket.h"
+
+// Default to using PROGMEM
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+#endif
@@ -0,0 +1,280 @@
+#ifndef __INC_FASTPIN_AVR_H
+#define __INC_FASTPIN_AVR_H
+
+FASTLED_NAMESPACE_BEGIN
+
+#if defined(FASTLED_FORCE_SOFTWARE_PINS)
+// Software pins were requested explicitly: tell the user, flag the absence of hardware pin support,
+// and drop any hardware-support marker that may already be defined.
+#warning "Software pin support forced, pin access will be slightly slower."
+#define NO_HARDWARE_PIN_SUPPORT
+#undef HAS_HARDWARE_PIN_SUPPORT
+
+#else
+
+// Evaluates to 1 when the address returned by FastPin<_PIN>::port(), minus 0x20, is below 64, and to 2 otherwise.
+// NOTE(review): presumably the cycle cost of a write to that port register (low I/O space vs. extended
+// I/O space) -- confirm against the callers that consume this value.
+#define AVR_PIN_CYCLES(_PIN) ((((int)FastPin<_PIN>::port())-0x20 < 64) ? 1 : 2)
+
+/// Class definition for a Pin where we know the port registers at compile time for said pin.  This allows us to make
+/// a lot of optimizations, as the inlined hi/lo methods will devolve to a single io register write/bitset.
+///
+/// _PORT, _DDR and _PIN are accessor types, each providing a static r() that returns a reference to the matching
+/// register; _MASK is the bit mask of this pin inside those registers.  The PIN parameter (the pin number) is not
+/// referenced inside the class body.
+template<uint8_t PIN, uint8_t _MASK, typename _PORT, typename _DDR, typename _PIN> class _AVRPIN {
+public:
+	// pointer to, and value of, an 8-bit port register
+	typedef volatile uint8_t * port_ptr_t;
+	typedef uint8_t port_t;
+
+	// set / clear this pin's bit in the data direction register
+	inline static void setOutput() { _DDR::r() |= _MASK; }
+	inline static void setInput() { _DDR::r() &= ~_MASK; }
+
+	// set / clear this pin's bit in the port register (read-modify-write of the register)
+	inline static void hi() __attribute__ ((always_inline)) { _PORT::r() |= _MASK; }
+	inline static void lo() __attribute__ ((always_inline)) { _PORT::r() &= ~_MASK; }
+	// overwrite the WHOLE port register with val, not just this pin's bit
+	inline static void set(register uint8_t val) __attribute__ ((always_inline)) { _PORT::r() = val; }
+
+	// two toggles back to back: the pin changes state and then returns to its previous state
+	inline static void strobe() __attribute__ ((always_inline)) { toggle(); toggle(); }
+
+	// writes the mask to the PIN register
+	// NOTE(review): relies on the AVR behaviour where writing 1 to a PINx bit toggles the PORTx bit -- confirm for the target part
+	inline static void toggle() __attribute__ ((always_inline)) { _PIN::r() = _MASK; }
+
+	// pointer-taking overloads: the port argument is ignored, the compile-time register is always used
+	inline static void hi(register port_ptr_t port) __attribute__ ((always_inline)) { hi(); }
+	inline static void lo(register port_ptr_t port) __attribute__ ((always_inline)) { lo(); }
+	inline static void fastset(register port_ptr_t port, register uint8_t val) __attribute__ ((always_inline)) { set(val); }
+
+	// current port register value with this pin's bit set / cleared; nothing is written back
+	inline static port_t hival() __attribute__ ((always_inline)) { return _PORT::r() | _MASK; }
+	inline static port_t loval() __attribute__ ((always_inline)) { return _PORT::r() & ~_MASK; }
+	// address of the port register and the pin's bit mask
+	inline static port_ptr_t port() __attribute__ ((always_inline)) { return &_PORT::r(); }
+	inline static port_t mask() __attribute__ ((always_inline)) { return _MASK; }
+};
+
+
+
+/// AVR definitions for pins.  Getting around the fact that I can't pass GPIO register addresses in as template arguments by instead creating
+/// a custom type for each GPIO register with a single, static, aggressively inlined function that returns that specific GPIO register.  A similar
+/// trick is used a bit further below for the ARM GPIO registers (of which there are far more than on AVR!)
+// reference to an 8-bit register
+typedef volatile uint8_t & reg8_t;
+// _R(T): names the generated accessor struct for register T
+#define _R(T) struct __gen_struct_ ## T
+// _RD8(T): defines the accessor struct for register T; its static r() returns T as a reg8_t
+#define _RD8(T) struct __gen_struct_ ## T { static inline reg8_t r() { return T; }};
+// _IO(L): defines the accessor structs for DDR<L>, PORT<L> and PIN<L> of port letter L
+#define _IO(L) _RD8(DDR ## L); _RD8(PORT ## L); _RD8(PIN ## L);
+// _DEFPIN_AVR(_PIN, MASK, L): specializes FastPin<_PIN> as an _AVRPIN using bit MASK of port letter L;
+// _IO(L) must have been expanded first so that the three accessor structs exist
+#define _DEFPIN_AVR(_PIN, MASK, L) template<> class FastPin<_PIN> : public _AVRPIN<_PIN, MASK, _R(PORT ## L), _R(DDR ## L), _R(PIN ## L)> {};
+
+// ATtiny25/45/85 are the same 8-pin family and only provide PORTB, so the ATtiny25 belongs here
+// and not with the PORTA/PORTB parts further down (where _IO(A) would reference registers it lacks).
+#if defined(__AVR_ATtiny85__) || defined(__AVR_ATtiny45__) || defined(__AVR_ATtiny25__)
+_IO(B);
+
+#define MAX_PIN 5
+
+_DEFPIN_AVR(0, 0x01, B); _DEFPIN_AVR(1, 0x02, B); _DEFPIN_AVR(2, 0x04, B); _DEFPIN_AVR(3, 0x08, B);
+_DEFPIN_AVR(4, 0x10, B); _DEFPIN_AVR(5, 0x20, B);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// NOTE(review): if ARDUINO_AVR_DIGISPARK builds also define __AVR_ATtiny85__, the branch above is taken
+// first and this layout is never used -- confirm which one is intended.
+#elif defined(ARDUINO_AVR_DIGISPARK) // digispark pin layout
+#define MAX_PIN 5
+#define HAS_HARDWARE_PIN_SUPPORT 1
+_IO(A); _IO(B);
+
+_DEFPIN_AVR(0, 0x01, B); _DEFPIN_AVR(1, 0x02, B); _DEFPIN_AVR(2, 0x04, B);
+_DEFPIN_AVR(3, 0x80, A); _DEFPIN_AVR(4, 0x40, A); _DEFPIN_AVR(5, 0x20, A);
+
+// ATtiny24/44/84: pins 0-7 on PORTA, pins 8-10 on PORTB bits 2..0
+#elif defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__)
+_IO(A); _IO(B);
+
+#define MAX_PIN 10
+
+_DEFPIN_AVR(0, 0x01, A); _DEFPIN_AVR(1, 0x02, A); _DEFPIN_AVR(2, 0x04, A); _DEFPIN_AVR(3, 0x08, A);
+_DEFPIN_AVR(4, 0x10, A); _DEFPIN_AVR(5, 0x20, A); _DEFPIN_AVR(6, 0x40, A); _DEFPIN_AVR(7, 0x80, A);
+_DEFPIN_AVR(8, 0x04, B); _DEFPIN_AVR(9, 0x02, B); _DEFPIN_AVR(10, 0x01, B);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(ARDUINO_AVR_DIGISPARKPRO)
+
+_IO(A); _IO(B);
+#define MAX_PIN 12
+// Every other board branch in this header that defines FastPin specializations also sets this marker.
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// pins 0-4 on PORTB, pins 5-12 on PORTA
+_DEFPIN_AVR(0, 0x01, B); _DEFPIN_AVR(1, 0x02, B); _DEFPIN_AVR(2, 0x04, B); _DEFPIN_AVR(3, 0x20, B);
+_DEFPIN_AVR(4, 0x08, B); _DEFPIN_AVR(5, 0x80, A); _DEFPIN_AVR(6, 0x01, A); _DEFPIN_AVR(7, 0x02, A);
+_DEFPIN_AVR(8, 0x04, A); _DEFPIN_AVR(9, 0x08, A); _DEFPIN_AVR(10, 0x10, A); _DEFPIN_AVR(11, 0x20, A);
+_DEFPIN_AVR(12, 0x40, A);
+
+#elif defined(__AVR_ATtiny167__) || defined(__AVR_ATtiny87__)
+_IO(A); _IO(B);
+
+#define MAX_PIN 15
+
+_DEFPIN_AVR(0, 0x01, A);  _DEFPIN_AVR(1, 0x02, A);   _DEFPIN_AVR(2, 0x04, A);  _DEFPIN_AVR(3, 0x08, A);
+_DEFPIN_AVR(4, 0x10, A);  _DEFPIN_AVR(5, 0x20, A);   _DEFPIN_AVR(6, 0x40, A);  _DEFPIN_AVR(7, 0x80, A);
+_DEFPIN_AVR(8, 0x01, B);  _DEFPIN_AVR(9, 0x02, B);   _DEFPIN_AVR(10, 0x04, B); _DEFPIN_AVR(11, 0x08, B);
+_DEFPIN_AVR(12, 0x10, B); _DEFPIN_AVR(13, 0x20, B); _DEFPIN_AVR(14, 0x40, B); _DEFPIN_AVR(15, 0x80, B);
+
+#define SPI_DATA 4
+#define SPI_CLOCK 5
+#define AVR_HARDWARE_SPI 1
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+#elif defined(ARDUINO_HOODLOADER2) && (defined(__AVR_ATmega32U2__) || defined(__AVR_ATmega16U2__) || defined(__AVR_ATmega8U2__)) || defined(__AVR_AT90USB82__) || defined(__AVR_AT90USB162__)
+
+_IO(D); _IO(B); _IO(C);
+
+#define MAX_PIN 20
+
+_DEFPIN_AVR( 0, 0x01, B); _DEFPIN_AVR( 1, 0x02, B); _DEFPIN_AVR( 2, 0x04, B); _DEFPIN_AVR( 3, 0x08, B);
+_DEFPIN_AVR( 4, 0x10, B); _DEFPIN_AVR( 5, 0x20, B); _DEFPIN_AVR( 6, 0x40, B); _DEFPIN_AVR( 7, 0x80, B);
+
+_DEFPIN_AVR( 8, 0x80, C); _DEFPIN_AVR( 9, 0x40, C); _DEFPIN_AVR( 10, 0x20,C); _DEFPIN_AVR( 11, 0x10, C);
+_DEFPIN_AVR( 12, 0x04, C); _DEFPIN_AVR( 13, 0x01, D); _DEFPIN_AVR( 14, 0x02, D); _DEFPIN_AVR(15, 0x04, D);
+_DEFPIN_AVR( 16, 0x08, D); _DEFPIN_AVR( 17, 0x10, D); _DEFPIN_AVR( 18, 0x20, D); _DEFPIN_AVR( 19, 0x40, D);
+_DEFPIN_AVR( 20, 0x80, D);
+
+#define HAS_HARDWARE_PIN_SUPPORT 1
+// #define SPI_DATA 2
+// #define SPI_CLOCK 1
+// #define AVR_HARDWARE_SPI 1
+
+#elif defined(__AVR_ATmega328P__) || defined(__AVR_ATmega168__) || defined(__AVR_ATmega168P__) || defined(__AVR_ATmega8__)
+// Accelerated port definitions for arduino avrs
+_IO(D); _IO(B); _IO(C);
+
+#define MAX_PIN 19
+_DEFPIN_AVR( 0, 0x01, D); _DEFPIN_AVR( 1, 0x02, D); _DEFPIN_AVR( 2, 0x04, D); _DEFPIN_AVR( 3, 0x08, D);
+_DEFPIN_AVR( 4, 0x10, D); _DEFPIN_AVR( 5, 0x20, D); _DEFPIN_AVR( 6, 0x40, D); _DEFPIN_AVR( 7, 0x80, D);
+_DEFPIN_AVR( 8, 0x01, B); _DEFPIN_AVR( 9, 0x02, B); _DEFPIN_AVR(10, 0x04, B); _DEFPIN_AVR(11, 0x08, B);
+_DEFPIN_AVR(12, 0x10, B); _DEFPIN_AVR(13, 0x20, B); _DEFPIN_AVR(14, 0x01, C); _DEFPIN_AVR(15, 0x02, C);
+_DEFPIN_AVR(16, 0x04, C); _DEFPIN_AVR(17, 0x08, C); _DEFPIN_AVR(18, 0x10, C); _DEFPIN_AVR(19, 0x20, C);
+
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define SPI_SELECT 10
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#define SPI_UART0_DATA 1
+#define SPI_UART0_CLOCK 4
+
+#elif defined(__AVR_ATmega1284P__)
+
+_IO(A); _IO(B); _IO(C); _IO(D);
+
+// highest pin number specialized below (pins 0..31); the other board branches all define MAX_PIN as well
+#define MAX_PIN 31
+_DEFPIN_AVR(0, 1<<0, D); _DEFPIN_AVR(1, 1<<1, D); _DEFPIN_AVR(2, 1<<2, B); _DEFPIN_AVR(3, 1<<3, B);
+_DEFPIN_AVR(4, 1<<0, B); _DEFPIN_AVR(5, 1<<1, B); _DEFPIN_AVR(6, 1<<2, D); _DEFPIN_AVR(7, 1<<3, D);
+_DEFPIN_AVR(8, 1<<5, D); _DEFPIN_AVR(9, 1<<6, D); _DEFPIN_AVR(10, 1<<4, B); _DEFPIN_AVR(11, 1<<5, B);
+_DEFPIN_AVR(12, 1<<6, B); _DEFPIN_AVR(13, 1<<7, B); _DEFPIN_AVR(14, 1<<7, A); _DEFPIN_AVR(15, 1<<6, A);
+_DEFPIN_AVR(16, 1<<5, A); _DEFPIN_AVR(17, 1<<4, A); _DEFPIN_AVR(18, 1<<3, A); _DEFPIN_AVR(19, 1<<2, A);
+_DEFPIN_AVR(20, 1<<1, A); _DEFPIN_AVR(21, 1<<0, A); _DEFPIN_AVR(22, 1<<0, C); _DEFPIN_AVR(23, 1<<1, C);
+_DEFPIN_AVR(24, 1<<2, C); _DEFPIN_AVR(25, 1<<3, C); _DEFPIN_AVR(26, 1<<4, C); _DEFPIN_AVR(27, 1<<5, C);
+_DEFPIN_AVR(28, 1<<6, C); _DEFPIN_AVR(29, 1<<7, C); _DEFPIN_AVR(30, 1<<4, D); _DEFPIN_AVR(31, 1<<7, D);
+
+// hardware SPI lines expressed as pin numbers from the table above (11 = PB5, 13 = PB7, 10 = PB4)
+#define SPI_DATA 11
+#define SPI_CLOCK 13
+#define SPI_SELECT 10
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+#elif defined(__AVR_ATmega1280__) || defined(__AVR_ATmega2560__)
+// megas
+
+_IO(A); _IO(B); _IO(C); _IO(D); _IO(E); _IO(F); _IO(G); _IO(H); _IO(J); _IO(K); _IO(L);
+
+#define MAX_PIN 69
+_DEFPIN_AVR(0, 1, E); _DEFPIN_AVR(1, 2, E); _DEFPIN_AVR(2, 16, E); _DEFPIN_AVR(3, 32, E);
+_DEFPIN_AVR(4, 32, G); _DEFPIN_AVR(5, 8, E); _DEFPIN_AVR(6, 8, H); _DEFPIN_AVR(7, 16, H);
+_DEFPIN_AVR(8, 32, H); _DEFPIN_AVR(9, 64, H); _DEFPIN_AVR(10, 16, B); _DEFPIN_AVR(11, 32, B);
+_DEFPIN_AVR(12, 64, B); _DEFPIN_AVR(13, 128, B); _DEFPIN_AVR(14, 2, J); _DEFPIN_AVR(15, 1, J);
+_DEFPIN_AVR(16, 2, H); _DEFPIN_AVR(17, 1, H); _DEFPIN_AVR(18, 8, D); _DEFPIN_AVR(19, 4, D);
+_DEFPIN_AVR(20, 2, D); _DEFPIN_AVR(21, 1, D); _DEFPIN_AVR(22, 1, A); _DEFPIN_AVR(23, 2, A);
+_DEFPIN_AVR(24, 4, A); _DEFPIN_AVR(25, 8, A); _DEFPIN_AVR(26, 16, A); _DEFPIN_AVR(27, 32, A);
+_DEFPIN_AVR(28, 64, A); _DEFPIN_AVR(29, 128, A); _DEFPIN_AVR(30, 128, C); _DEFPIN_AVR(31, 64, C);
+_DEFPIN_AVR(32, 32, C); _DEFPIN_AVR(33, 16, C); _DEFPIN_AVR(34, 8, C); _DEFPIN_AVR(35, 4, C);
+_DEFPIN_AVR(36, 2, C); _DEFPIN_AVR(37, 1, C); _DEFPIN_AVR(38, 128, D); _DEFPIN_AVR(39, 4, G);
+_DEFPIN_AVR(40, 2, G); _DEFPIN_AVR(41, 1, G); _DEFPIN_AVR(42, 128, L); _DEFPIN_AVR(43, 64, L);
+_DEFPIN_AVR(44, 32, L); _DEFPIN_AVR(45, 16, L); _DEFPIN_AVR(46, 8, L); _DEFPIN_AVR(47, 4, L);
+_DEFPIN_AVR(48, 2, L); _DEFPIN_AVR(49, 1, L); _DEFPIN_AVR(50, 8, B); _DEFPIN_AVR(51, 4, B);
+_DEFPIN_AVR(52, 2, B); _DEFPIN_AVR(53, 1, B); _DEFPIN_AVR(54, 1, F); _DEFPIN_AVR(55, 2, F);
+_DEFPIN_AVR(56, 4, F); _DEFPIN_AVR(57, 8, F); _DEFPIN_AVR(58, 16, F); _DEFPIN_AVR(59, 32, F);
+_DEFPIN_AVR(60, 64, F); _DEFPIN_AVR(61, 128, F); _DEFPIN_AVR(62, 1, K); _DEFPIN_AVR(63, 2, K);
+_DEFPIN_AVR(64, 4, K); _DEFPIN_AVR(65, 8, K); _DEFPIN_AVR(66, 16, K); _DEFPIN_AVR(67, 32, K);
+_DEFPIN_AVR(68, 64, K); _DEFPIN_AVR(69, 128, K);
+
+#define SPI_DATA 51
+#define SPI_CLOCK 52
+#define SPI_SELECT 53
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// Leonardo, teensy, blinkm
+#elif defined(__AVR_ATmega32U4__) && defined(CORE_TEENSY)
+
+// teensy defs
+_IO(B); _IO(C); _IO(D); _IO(E); _IO(F);
+
+#define MAX_PIN 23
+_DEFPIN_AVR(0, 1, B); _DEFPIN_AVR(1, 2, B); _DEFPIN_AVR(2, 4, B); _DEFPIN_AVR(3, 8, B);
+_DEFPIN_AVR(4, 128, B); _DEFPIN_AVR(5, 1, D); _DEFPIN_AVR(6, 2, D); _DEFPIN_AVR(7, 4, D);
+_DEFPIN_AVR(8, 8, D); _DEFPIN_AVR(9, 64, C); _DEFPIN_AVR(10, 128, C); _DEFPIN_AVR(11, 64, D);
+_DEFPIN_AVR(12, 128, D); _DEFPIN_AVR(13, 16, B); _DEFPIN_AVR(14, 32, B); _DEFPIN_AVR(15, 64, B);
+_DEFPIN_AVR(16, 128, F); _DEFPIN_AVR(17, 64, F); _DEFPIN_AVR(18, 32, F); _DEFPIN_AVR(19, 16, F);
+_DEFPIN_AVR(20, 2, F); _DEFPIN_AVR(21, 1, F); _DEFPIN_AVR(22, 16, D); _DEFPIN_AVR(23, 32, D);
+
+#define SPI_DATA 2
+#define SPI_CLOCK 1
+#define SPI_SELECT 0
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+#define SPI_UART1_DATA 8
+#define SPI_UART1_CLOCK 23
+
+#elif defined(__AVR_AT90USB646__) || defined(__AVR_AT90USB1286__)
+// teensy++ 2 defs
+
+_IO(A); _IO(B); _IO(C); _IO(D); _IO(E); _IO(F);
+
+#define MAX_PIN 45
+_DEFPIN_AVR(0, 1, D); _DEFPIN_AVR(1, 2, D); _DEFPIN_AVR(2, 4, D); _DEFPIN_AVR(3, 8, D);
+_DEFPIN_AVR(4, 16, D); _DEFPIN_AVR(5, 32, D); _DEFPIN_AVR(6, 64, D); _DEFPIN_AVR(7, 128, D);
+_DEFPIN_AVR(8, 1, E); _DEFPIN_AVR(9, 2, E); _DEFPIN_AVR(10, 1, C); _DEFPIN_AVR(11, 2, C);
+_DEFPIN_AVR(12, 4, C); _DEFPIN_AVR(13, 8, C); _DEFPIN_AVR(14, 16, C); _DEFPIN_AVR(15, 32, C);
+_DEFPIN_AVR(16, 64, C); _DEFPIN_AVR(17, 128, C); _DEFPIN_AVR(18, 64, E); _DEFPIN_AVR(19, 128, E);
+_DEFPIN_AVR(20, 1, B); _DEFPIN_AVR(21, 2, B); _DEFPIN_AVR(22, 4, B); _DEFPIN_AVR(23, 8, B);
+_DEFPIN_AVR(24, 16, B); _DEFPIN_AVR(25, 32, B); _DEFPIN_AVR(26, 64, B); _DEFPIN_AVR(27, 128, B);
+_DEFPIN_AVR(28, 1, A); _DEFPIN_AVR(29, 2, A); _DEFPIN_AVR(30, 4, A); _DEFPIN_AVR(31, 8, A);
+_DEFPIN_AVR(32, 16, A); _DEFPIN_AVR(33, 32, A); _DEFPIN_AVR(34, 64, A); _DEFPIN_AVR(35, 128, A);
+_DEFPIN_AVR(36, 16, E); _DEFPIN_AVR(37, 32, E); _DEFPIN_AVR(38, 1, F); _DEFPIN_AVR(39, 2, F);
+_DEFPIN_AVR(40, 4, F); _DEFPIN_AVR(41, 8, F); _DEFPIN_AVR(42, 16, F); _DEFPIN_AVR(43, 32, F);
+_DEFPIN_AVR(44, 64, F); _DEFPIN_AVR(45, 128, F);
+
+#define SPI_DATA 22
+#define SPI_CLOCK 21
+#define SPI_SELECT 20
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+#define SPI_UART1_DATA 3
+#define SPI_UART1_CLOCK 5
+
+
+#elif defined(__AVR_ATmega32U4__)
+
+// Leonardo defs
+_IO(B); _IO(C); _IO(D); _IO(E); _IO(F);
+
+#define MAX_PIN 23
+_DEFPIN_AVR(0, 4, D); _DEFPIN_AVR(1, 8, D); _DEFPIN_AVR(2, 2, D); _DEFPIN_AVR(3, 1, D); 
+_DEFPIN_AVR(4, 16, D); _DEFPIN_AVR(5, 64, C); _DEFPIN_AVR(6, 128, D); _DEFPIN_AVR(7, 64, E); 
+_DEFPIN_AVR(8, 16, B); _DEFPIN_AVR(9, 32, B); _DEFPIN_AVR(10, 64, B); _DEFPIN_AVR(11, 128, B); 
+_DEFPIN_AVR(12, 64, D); _DEFPIN_AVR(13, 128, C); _DEFPIN_AVR(14, 8, B); _DEFPIN_AVR(15, 2, B); 
+_DEFPIN_AVR(16, 4, B); _DEFPIN_AVR(17, 1, B); _DEFPIN_AVR(18, 128, F); _DEFPIN_AVR(19, 64, F); 
+_DEFPIN_AVR(20, 32, F); _DEFPIN_AVR(21, 16, F); _DEFPIN_AVR(22, 2, F); _DEFPIN_AVR(23, 1, F); 
+
+#define SPI_DATA 16
+#define SPI_CLOCK 15
+#define AVR_HARDWARE_SPI 1
+#define HAS_HARDWARE_PIN_SUPPORT 1
+
+// PD3/PD5
+// #define SPI_UART1_DATA 1
+// #define SPI_UART1_CLOCK 4
+
+
+#endif
+
+#endif // FASTLED_FORCE_SOFTWARE_PINS
+
+FASTLED_NAMESPACE_END
+
+#endif // __INC_FASTPIN_AVR_H
@@ -0,0 +1,505 @@
+#ifndef __INC_FASTSPI_AVR_H
+#define __INC_FASTSPI_AVR_H
+
+FASTLED_NAMESPACE_BEGIN
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using USART registers and friends
+//
+// TODO: Complete/test implementation - right now this doesn't work
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// uno/mini/duemilanove
+#if defined(AVR_HARDWARE_SPI)
+
+#if defined(UBRR1)
+
+#ifndef UCPHA1
+#define UCPHA1 1
+#endif
+
+/// Byte-oriented output through USART1 configured for master SPI (MSPI) mode.
+///
+/// Template parameters:
+///  - _DATA_PIN / _CLOCK_PIN: FastPin numbers of the data and clock lines; made outputs in init() and
+///    bit-banged directly by writeBit()
+///  - _SPI_CLOCK_DIVIDER: clock divider, converted to a UBRR1 value by setSPIRate()
+///
+/// An optional Selectable is asserted by select() and released by release().
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class AVRUSART1SPIOutput {
+	// optional chip-select object; NULL when none is attached
+	Selectable *m_pSelect;
+
+public:
+	AVRUSART1SPIOutput() { m_pSelect = NULL; }
+	AVRUSART1SPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Put USART1 into MSPI mode 0, enable it, and switch the clock/data pins to output.
+	void init() {
+		// start with the baud rate register cleared; the real rate is applied at the end
+		UBRR1 = 0;
+
+		/* Set MSPI mode of operation and SPI data mode 0. */
+		UCSR1C = (1<<UMSEL11)|(1<<UMSEL10)|(0<<UCPHA1)|(0<<UCPOL1);
+		/* Enable receiver and transmitter. */
+		UCSR1B = (1<<RXEN1)|(1<<TXEN1);
+
+		FastPin<_CLOCK_PIN>::setOutput();
+		FastPin<_DATA_PIN>::setOutput();
+
+		// must be done last, see page 206
+		setSPIRate();
+	}
+
+	/// Program UBRR1 from the divider: (divider/2)-1 for dividers above 2, otherwise 0.
+	void setSPIRate() {
+		if(_SPI_CLOCK_DIVIDER > 2) {
+			UBRR1 = (_SPI_CLOCK_DIVIDER/2)-1;
+		} else {
+			UBRR1 = 0;
+		}
+	}
+
+
+	static void stop() {
+		// TODO: stop the uart spi output
+	}
+
+	/// Exchange the "a write is pending" flag: stores `wait` as the new flag and returns the previous one.
+	/// The flag is a function-local static, i.e. shared by every call for this template instantiation.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		if(sWait) {
+			sWait = wait; return true;
+		} else {
+			sWait = wait; return false;
+		}
+	}
+
+	/// If a previous write flagged itself as pending, spin until UDRE1 is set in UCSR1A.  Clears the flag.
+	static void wait() __attribute__((always_inline)) {
+		if(shouldWait()) {
+			while(!(UCSR1A & (1<<UDRE1)));
+		}
+	}
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	/// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// writeByte waits for the previous byte first; writeBytePostWait waits after queuing; writeByteNoWait never waits
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); UDR1=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { UDR1=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { UDR1=b; shouldWait(true); }
+
+
+	/// Bit-bang bit number BIT of b onto the data pin and pulse the clock pin once.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		// bitwise test of the selected bit (a logical && would be true for any non-zero b)
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+	}
+
+	// nothing to switch for the USART variant
+	void enable_pins() { }
+	void disable_pins() { }
+
+	/// Assert the Selectable (if any) and re-apply the SPI rate.
+	void select() {
+		if(m_pSelect != NULL) {
+			m_pSelect->select();
+		}
+		enable_pins();
+		setSPIRate();
+	}
+
+	/// Release the Selectable (if any).
+	void release() {
+		if(m_pSelect != NULL) {
+			m_pSelect->release();
+		}
+		disable_pins();
+	}
+
+	/// Write `value` len times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) {
+			writeByte(value);
+		}
+	}
+
+	/// Write `value` len times, bracketed by select()/release().
+	void writeBytesValue(uint8_t value, int len) {
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		release();
+	}
+
+	// Write a block of n uint8_ts out, each byte passed through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				// the start bit is bit-banged, so every byte must be fully queued before the next one
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		// let the last queued byte drain before the select line is released (as the sibling SPI classes do)
+		waitFully();
+		release();
+	}
+};
+#endif
+
+#if defined(UBRR0)
+/// Byte-oriented output through USART0 configured for master SPI (MSPI) mode.
+///
+/// Template parameters:
+///  - _DATA_PIN / _CLOCK_PIN: FastPin numbers of the data and clock lines; made outputs in init() and
+///    bit-banged directly by writeBit()
+///  - _SPI_CLOCK_DIVIDER: clock divider, converted to a UBRR0 value by setSPIRate()
+///
+/// An optional Selectable is asserted by select() and released by release().
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class AVRUSART0SPIOutput {
+	// optional chip-select object; NULL when none is attached
+	Selectable *m_pSelect;
+
+public:
+	AVRUSART0SPIOutput() { m_pSelect = NULL; }
+	AVRUSART0SPIOutput(Selectable *pSelect) { m_pSelect = pSelect; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Put USART0 into MSPI mode, enable it, and switch the clock/data pins to output.
+	void init() {
+		// start with the baud rate register cleared; the real rate is applied at the end
+		UBRR0 = 0;
+
+		/* Set MSPI mode of operation and SPI data mode 0. */
+		UCSR0C = (1<<UMSEL01)|(1<<UMSEL00)/*|(0<<UCPHA0)*/|(0<<UCPOL0);
+		/* Enable receiver and transmitter. */
+		UCSR0B = (1<<RXEN0)|(1<<TXEN0);
+
+		FastPin<_CLOCK_PIN>::setOutput();
+		FastPin<_DATA_PIN>::setOutput();
+
+
+		// must be done last, see page 206
+		setSPIRate();
+	}
+
+	/// Program UBRR0 from the divider: (divider/2)-1 for dividers above 2, otherwise 0.
+	void setSPIRate() {
+		if(_SPI_CLOCK_DIVIDER > 2) {
+			UBRR0 = (_SPI_CLOCK_DIVIDER/2)-1;
+		} else {
+			UBRR0 = 0;
+		}
+	}
+
+	static void stop() {
+		// TODO: stop the uart spi output
+	}
+
+	/// Exchange the "a write is pending" flag: stores `wait` as the new flag and returns the previous one.
+	/// The flag is a function-local static, i.e. shared by every call for this template instantiation.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		if(sWait) {
+			sWait = wait; return true;
+		} else {
+			sWait = wait; return false;
+		}
+	}
+
+	/// If a previous write flagged itself as pending, spin until UDRE0 is set in UCSR0A.  Clears the flag.
+	static void wait() __attribute__((always_inline)) {
+		if(shouldWait()) {
+			while(!(UCSR0A & (1<<UDRE0)));
+		}
+	}
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	/// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// writeByte waits for the previous byte first; writeBytePostWait waits after queuing; writeByteNoWait never waits
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); UDR0=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { UDR0=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { UDR0=b; shouldWait(true); }
+
+
+	/// Bit-bang bit number BIT of b onto the data pin and pulse the clock pin once.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		// bitwise test of the selected bit (a logical && would be true for any non-zero b)
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+	}
+
+	// nothing to switch for the USART variant
+	void enable_pins() { }
+	void disable_pins() { }
+
+	/// Assert the Selectable (if any) and re-apply the SPI rate.
+	void select() {
+		if(m_pSelect != NULL) {
+			m_pSelect->select();
+		}
+		enable_pins();
+		setSPIRate();
+	}
+
+	/// Release the Selectable (if any).
+	void release() {
+		if(m_pSelect != NULL) {
+			m_pSelect->release();
+		}
+		disable_pins();
+	}
+
+	/// Write `value` len times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) {
+			writeByte(value);
+		}
+	}
+
+	/// Write `value` len times, bracketed by select()/release().
+	void writeBytesValue(uint8_t value, int len) {
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		release();
+	}
+
+	// Write a block of n uint8_ts out, each byte passed through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				// the start bit is bit-banged, so every byte must be fully queued before the next one
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		// let the last queued byte drain before the select line is released
+		waitFully();
+		release();
+	}
+};
+
+#endif
+
+
+#if defined(SPSR)
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Hardware SPI support using SPDR registers and friends
+//
+// Technically speaking, this uses the AVR SPI registers.  This will work on the Teensy 3.0 because Paul made a set of compatibility
+// classes that map the AVR SPI registers to ARM's, however this caps the performance of output.
+//
+// TODO: implement ARMHardwareSPIOutput
+//
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Byte-oriented output through the AVR hardware SPI peripheral (SPCR/SPSR/SPDR).
+///
+/// Template parameters:
+///  - _DATA_PIN / _CLOCK_PIN: FastPin numbers of the data and clock lines; made outputs in init() and
+///    bit-banged directly by writeBit()
+///  - _SPI_CLOCK_DIVIDER: clock divider, mapped onto the SPR1/SPR0/SPI2X bits by setSPIRate()
+///
+/// An optional Selectable is asserted by select() and released by release().
+template <uint8_t _DATA_PIN, uint8_t _CLOCK_PIN, uint8_t _SPI_CLOCK_DIVIDER>
+class AVRHardwareSPIOutput {
+	// optional chip-select object; NULL when none is attached
+	Selectable *m_pSelect;
+	// not read by any method of this class; kept for layout compatibility
+	bool mWait;
+public:
+	AVRHardwareSPIOutput() { m_pSelect = NULL; mWait = false;}
+	// both constructors leave mWait in the same defined state
+	AVRHardwareSPIOutput(Selectable *pSelect) { m_pSelect = pSelect; mWait = false; }
+	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }
+
+	/// Translate the divider into prescaler bits: thresholds 128/64/32/16/8/4 select SPR1/SPR0, and
+	/// the double-speed bit SPI2X is set for the 32, 8 and below-4 cases and cleared otherwise.
+	void setSPIRate() {
+		SPCR &= ~ ( (1<<SPR1) | (1<<SPR0) ); 	// clear out the prescalar bits
+
+		bool b2x = false;
+
+		if(_SPI_CLOCK_DIVIDER >= 128) { SPCR |= (1<<SPR1); SPCR |= (1<<SPR0); }
+		else if(_SPI_CLOCK_DIVIDER >= 64) { SPCR |= (1<<SPR1);}
+		else if(_SPI_CLOCK_DIVIDER >= 32) { SPCR |= (1<<SPR1); b2x = true;  }
+		else if(_SPI_CLOCK_DIVIDER >= 16) { SPCR |= (1<<SPR0); }
+		else if(_SPI_CLOCK_DIVIDER >= 8) { SPCR |= (1<<SPR0); b2x = true; }
+		else if(_SPI_CLOCK_DIVIDER >= 4) { /* do nothing - default rate */ }
+		else { b2x = true; }
+
+		if(b2x) { SPSR |= (1<<SPI2X); }
+		else { SPSR &= ~ (1<<SPI2X); }
+	}
+
+	/// Configure the pins, enable SPI as master, flush the status/data registers, apply the clock rate,
+	/// send an initial zero byte and release the bus.
+	void init() {
+		volatile uint8_t clr;
+
+		// set the pins to output
+		FastPin<_DATA_PIN>::setOutput();
+		FastPin<_CLOCK_PIN>::setOutput();
+#ifdef SPI_SELECT
+		// Make sure the slave select line is set to output, or arduino will block us
+		FastPin<SPI_SELECT>::setOutput();
+		FastPin<SPI_SELECT>::lo();
+#endif
+
+		SPCR |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+
+		clr = SPSR; // clear SPI status register
+		clr = SPDR; // clear SPI data register
+		(void)clr;  // reference clr so it is not reported as unused
+
+		// prescaler and double-speed bits are programmed in one place only
+		setSPIRate();
+
+		SPDR=0;
+		shouldWait(false);
+		release();
+	}
+
+	/// Exchange the "a write is pending" flag: stores `wait` as the new flag and returns the previous one.
+	/// The flag is a function-local static, i.e. shared by every call for this template instantiation.
+	static bool shouldWait(bool wait = false) __attribute__((always_inline)) {
+		static bool sWait=false;
+		if(sWait) { sWait = wait; return true; } else { sWait = wait; return false; }
+	}
+	/// If a previous write flagged itself as pending, spin until SPIF is set in SPSR.  Clears the flag.
+	static void wait() __attribute__((always_inline)) { if(shouldWait()) { while(!(SPSR & (1<<SPIF))); } }
+	static void waitFully() __attribute__((always_inline)) { wait(); }
+
+	/// Write a 16-bit word, high byte first.
+	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }
+
+	// writeByte waits for the previous byte first; writeBytePostWait waits after queuing; writeByteNoWait never waits
+	static void writeByte(uint8_t b) __attribute__((always_inline)) { wait(); SPDR=b;  shouldWait(true); }
+	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); wait(); }
+	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { SPDR=b; shouldWait(true); }
+
+	/// Bit-bang bit number BIT of b: SPI is disabled (SPE cleared) while the data pin is driven and the
+	/// clock pin pulsed once, then re-enabled; the pending-write flag is cleared afterwards.
+	template <uint8_t BIT> inline static void writeBit(uint8_t b) {
+		SPCR &= ~(1 << SPE);
+		if(b & (1 << BIT)) {
+			FastPin<_DATA_PIN>::hi();
+		} else {
+			FastPin<_DATA_PIN>::lo();
+		}
+
+		FastPin<_CLOCK_PIN>::hi();
+		FastPin<_CLOCK_PIN>::lo();
+		SPCR |= 1 << SPE;
+		shouldWait(false);
+	}
+
+	void enable_pins() {
+		SPCR |= ((1<<SPE) | (1<<MSTR) ); 		// enable SPI as master
+	}
+
+	void disable_pins() {
+		SPCR &= ~(((1<<SPE) | (1<<MSTR) )); // disable SPI
+	}
+
+	/// Assert the Selectable (if any), enable SPI and re-apply the SPI rate.
+	void select() {
+		if(m_pSelect != NULL) { m_pSelect->select(); }
+		enable_pins();
+		setSPIRate();
+	}
+
+	/// Release the Selectable (if any) and disable SPI.
+	void release() {
+		if(m_pSelect != NULL) { m_pSelect->release(); }
+		disable_pins();
+	}
+
+	/// Write `value` len times without touching select/release.
+	static void writeBytesValueRaw(uint8_t value, int len) {
+		while(len--) { writeByte(value); }
+	}
+
+	/// Write `value` len times, bracketed by select()/release().
+	void writeBytesValue(uint8_t value, int len) {
+		select();
+		while(len--) {
+			writeByte(value);
+		}
+		release();
+	}
+
+	// Write a block of n uint8_ts out, each byte passed through D::adjust first
+	template <class D> void writeBytes(register uint8_t *data, int len) {
+		uint8_t *end = data + len;
+		select();
+		while(data != end) {
+			// a slight touch of delay here helps optimize the timing of the status register check loop (not used on ARM)
+			writeByte(D::adjust(*data++)); delaycycles<3>();
+		}
+		release();
+	}
+
+	void writeBytes(register uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }
+
+	// write a block of uint8_ts out in groups of three.  len is the total number of uint8_ts to write out.  The template
+	// parameters indicate how many uint8_ts to skip at the beginning and/or end of each grouping
+	template <uint8_t FLAGS, class D, EOrder RGB_ORDER> void writePixels(PixelController<RGB_ORDER> pixels) {
+		int len = pixels.mLen;
+
+		select();
+		while(pixels.has(1)) {
+			if(FLAGS & FLAG_START_BIT) {
+				// the start bit is bit-banged with SPI disabled, so each byte must finish before the next start bit
+				writeBit<0>(1);
+				writeBytePostWait(D::adjust(pixels.loadAndScale0()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale1()));
+				writeBytePostWait(D::adjust(pixels.loadAndScale2()));
+			} else {
+				writeByte(D::adjust(pixels.loadAndScale0()));
+				writeByte(D::adjust(pixels.loadAndScale1()));
+				writeByte(D::adjust(pixels.loadAndScale2()));
+			}
+
+			pixels.advanceData();
+			pixels.stepDithering();
+		}
+		D::postBlock(len);
+		// let the last byte finish before SPI is disabled by release()
+		waitFully();
+		release();
+	}
+};
+#endif
+
+#else
+// #define FASTLED_FORCE_SOFTWARE_SPI
+#endif
+
+// closes the FASTLED_NAMESPACE_BEGIN at the top of this header (no trailing semicolon, as in the sibling headers)
+FASTLED_NAMESPACE_END
+
+
+#endif
@@ -0,0 +1,52 @@
+#ifndef __INC_LED_SYSDEFS_AVR_H
+#define __INC_LED_SYSDEFS_AVR_H
+
+#define FASTLED_AVR
+
+#ifndef INTERRUPT_THRESHOLD
+#define INTERRUPT_THRESHOLD 2
+#endif
+
+#include <avr/io.h>
+#include <avr/interrupt.h> // for cli/sei definitions
+
+// Define the register types
+#if defined(ARDUINO) // && ARDUINO < 150
+typedef volatile       uint8_t RoReg; /**< 8-bit register intended for read-only use (declared volatile uint8_t, without const) */
+typedef volatile       uint8_t RwReg; /**< Read-Write 8-bit register (volatile uint8_t) */
+#endif
+
+
+// Default to disallowing interrupts (may want to gate this on teensy2 vs. other arm platforms, since the
+// teensy2 has a good, fast millis interrupt implementation)
+#ifndef FASTLED_ALLOW_INTERRUPTS
+#define FASTLED_ALLOW_INTERRUPTS 0
+#endif
+
+#if FASTLED_ALLOW_INTERRUPTS == 1
+#define FASTLED_ACCURATE_CLOCK
+#endif
+
+
+// Default to using PROGMEM here
+#ifndef FASTLED_USE_PROGMEM
+#define FASTLED_USE_PROGMEM 1
+#endif
+
+#if defined(ARDUINO_AVR_DIGISPARK) || defined(ARDUINO_AVR_DIGISPARKPRO)
+#ifndef NO_CORRECTION
+#define NO_CORRECTION 1
+#endif
+#endif
+
+// MS_COUNTER names the counter variable provided by the core with C linkage: timer0_millis_count when
+// CORE_TEENSY or TEENSYDUINO is defined, timer0_millis otherwise.
+// NOTE(review): presumably the millisecond tick count maintained by the core's timer0 interrupt -- confirm.
+extern "C" {
+#  if defined(CORE_TEENSY) || defined(TEENSYDUINO)
+extern volatile unsigned long timer0_millis_count;
+#    define MS_COUNTER timer0_millis_count
+#  else
+extern volatile unsigned long timer0_millis;
+#    define MS_COUNTER timer0_millis
+#  endif
+};
+
+#endif