This seems to work. It is efficient except at the 0X20000000 boundary.
Code:
// Teensy 3 memcpy
// Reports whether the address stored in `ptr` is an exact multiple of
// `alignment`. `alignment` must be non-zero because it is used as a divisor.
inline bool is_aligned(const void* ptr, uintptr_t alignment) {
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    const uintptr_t remainder = address % alignment;
    return remainder == 0;
}
// Copies `len` bytes from `src` to `dst`, taking special care around address
// 0x20000000.
//
// If either buffer straddles 0x20000000 and the copy is not fully word-aligned
// (dst, src and len all multiples of 4), the bytes are copied one at a time.
// Every other copy is forwarded unchanged to memcpy().
// NOTE(review): presumably memcpy() may issue unaligned multi-byte accesses
// that are unsafe across this boundary on the target - confirm against the
// hardware documentation.
//
// dst: destination buffer, at least `len` bytes.
// src: source buffer, at least `len` bytes; must not overlap dst (as memcpy).
// len: number of bytes to copy; 0 is a no-op.
void memcpyT3(void* dst, const void* src, size_t len) {
    constexpr uintptr_t kBoundary = 0x20000000UL;
    const uintptr_t dst_addr = reinterpret_cast<uintptr_t>(dst);
    const uintptr_t src_addr = reinterpret_cast<uintptr_t>(src);

    // [addr, addr + len) straddles the boundary iff it starts below it and
    // contains the byte at the boundary. The test is written as a subtraction
    // so that addr + len can never wrap, and it is strict so that a range
    // ending exactly at the boundary still takes the fast path.
    const bool dst_straddles =
        dst_addr < kBoundary && len > kBoundary - dst_addr;
    const bool src_straddles =
        src_addr < kBoundary && len > kBoundary - src_addr;

    // Both addresses and the length are multiples of 4 only when none of them
    // has either of its two low bits set.
    const bool word_aligned = ((dst_addr | src_addr | len) & 3u) == 0;

    if ((dst_straddles || src_straddles) && !word_aligned) {
        // Slow path: byte-by-byte copy across the boundary.
        // NOTE(review): an optimizer may be able to turn this loop back into a
        // memcpy call or wider accesses - verify the generated code.
        uint8_t* out = static_cast<uint8_t*>(dst);
        const uint8_t* in = static_cast<const uint8_t*>(src);
        for (size_t i = 0; i < len; ++i) {
            out[i] = in[i];
        }
    } else {
        memcpy(dst, src, len);
    }
}