diff options
author | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
---|---|---|
committer | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
commit | 4fd287655a72b9aea14cdac715ad5b90ed082ed2 (patch) | |
tree | 65d393bc0e699dd12d05b29ba568e04cea666207 /circuitpython/extmod/ulab/code | |
parent | 0150f70ce9c39e9e6dd878766c0620c85e47bed0 (diff) |
add circuitpython code
Diffstat (limited to 'circuitpython/extmod/ulab/code')
60 files changed, 15387 insertions, 0 deletions
diff --git a/circuitpython/extmod/ulab/code/micropython.cmake b/circuitpython/extmod/ulab/code/micropython.cmake new file mode 100644 index 0000000..66890c0 --- /dev/null +++ b/circuitpython/extmod/ulab/code/micropython.cmake @@ -0,0 +1,18 @@ +add_library(usermod_ulab INTERFACE) + +file(GLOB_RECURSE ULAB_SOURCES ${CMAKE_CURRENT_LIST_DIR}/*.c) + +target_sources(usermod_ulab INTERFACE + ${ULAB_SOURCES} +) + +target_include_directories(usermod_ulab INTERFACE + ${CMAKE_CURRENT_LIST_DIR} +) + +target_compile_definitions(usermod_ulab INTERFACE + MODULE_ULAB_ENABLED=1 +) + +target_link_libraries(usermod INTERFACE usermod_ulab) + diff --git a/circuitpython/extmod/ulab/code/micropython.mk b/circuitpython/extmod/ulab/code/micropython.mk new file mode 100644 index 0000000..d16b177 --- /dev/null +++ b/circuitpython/extmod/ulab/code/micropython.mk @@ -0,0 +1,38 @@ + +USERMODULES_DIR := $(USERMOD_DIR) + +# Add all C files to SRC_USERMOD. +SRC_USERMOD += $(USERMODULES_DIR)/scipy/linalg/linalg.c +SRC_USERMOD += $(USERMODULES_DIR)/scipy/optimize/optimize.c +SRC_USERMOD += $(USERMODULES_DIR)/scipy/signal/signal.c +SRC_USERMOD += $(USERMODULES_DIR)/scipy/special/special.c +SRC_USERMOD += $(USERMODULES_DIR)/ndarray_operators.c +SRC_USERMOD += $(USERMODULES_DIR)/ulab_tools.c +SRC_USERMOD += $(USERMODULES_DIR)/ndarray.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/ndarray/ndarray_iter.c +SRC_USERMOD += $(USERMODULES_DIR)/ndarray_properties.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/approx.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/compare.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/carray/carray.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/carray/carray_tools.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/create.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/fft/fft.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/fft/fft_tools.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/filter.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/linalg/linalg.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/linalg/linalg_tools.c 
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/numerical.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/poly.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/stats.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/transform.c +SRC_USERMOD += $(USERMODULES_DIR)/numpy/vector.c + +SRC_USERMOD += $(USERMODULES_DIR)/numpy/numpy.c +SRC_USERMOD += $(USERMODULES_DIR)/scipy/scipy.c +SRC_USERMOD += $(USERMODULES_DIR)/user/user.c +SRC_USERMOD += $(USERMODULES_DIR)/utils/utils.c +SRC_USERMOD += $(USERMODULES_DIR)/ulab.c + +CFLAGS_USERMOD += -I$(USERMODULES_DIR) + +override CFLAGS_EXTRA += -DMODULE_ULAB_ENABLED=1 diff --git a/circuitpython/extmod/ulab/code/ndarray.c b/circuitpython/extmod/ulab/code/ndarray.c new file mode 100644 index 0000000..f8caa67 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray.c @@ -0,0 +1,2255 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2022 Zoltán Vörös + * 2020 Jeff Epler for Adafruit Industries + * 2020 Taku Fukada +*/ + +#include <unistd.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/runtime.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objtuple.h" +#include "py/objint.h" + +#include "ulab_tools.h" +#include "ndarray.h" +#include "ndarray_operators.h" +#include "numpy/carray/carray.h" +#include "numpy/carray/carray_tools.h" + +mp_uint_t ndarray_print_threshold = NDARRAY_PRINT_THRESHOLD; +mp_uint_t ndarray_print_edgeitems = NDARRAY_PRINT_EDGEITEMS; + +//| """Manipulate numeric data similar to numpy +//| +//| `ulab` is a numpy-like module for micropython, meant to simplify and +//| speed up common mathematical operations on arrays. The primary goal was to +//| implement a small subset of numpy that might be useful in the context of a +//| microcontroller. This means low-level data processing of linear (array) and +//| two-dimensional (matrix) data. 
+//| +//| `ulab` is adapted from micropython-ulab, and the original project's +//| documentation can be found at +//| https://micropython-ulab.readthedocs.io/en/latest/ +//| +//| `ulab` is modeled after numpy, and aims to be a compatible subset where +//| possible. Numpy's documentation can be found at +//| https://docs.scipy.org/doc/numpy/index.html""" +//| + +void ndarray_set_complex_value(void *p, size_t index, mp_obj_t value) { + mp_float_t real, imag; + if(mp_obj_is_type(value, &mp_type_complex)) { + mp_obj_get_complex(value, &real, &imag); + ((mp_float_t *)p)[2 * index] = real; + ((mp_float_t *)p)[2 * index + 1] = imag; + } else { + real = mp_obj_get_float(value); + ((mp_float_t *)p)[2 * index] = real; + ((mp_float_t *)p)[2 * index + 1] = MICROPY_FLOAT_CONST(0.0); + } +} + +#ifdef CIRCUITPY +void ndarray_set_value(char typecode, void *p, size_t index, mp_obj_t val_in) { + switch (typecode) { + case NDARRAY_INT8: + ((signed char *)p)[index] = mp_obj_get_int(val_in); + break; + case NDARRAY_UINT8: + ((unsigned char *)p)[index] = mp_obj_get_int(val_in); + break; + case NDARRAY_INT16: + ((short *)p)[index] = mp_obj_get_int(val_in); + break; + case NDARRAY_UINT16: + ((unsigned short *)p)[index] = mp_obj_get_int(val_in); + break; + case NDARRAY_FLOAT: + ((mp_float_t *)p)[index] = mp_obj_get_float(val_in); + break; + #if ULAB_SUPPORTS_COMPLEX + case NDARRAY_COMPLEX: + ndarray_set_complex_value(p, index, val_in); + break; + #endif + } +} +#endif + +#if defined(MICROPY_VERSION_MAJOR) && MICROPY_VERSION_MAJOR == 1 && MICROPY_VERSION_MINOR == 11 + +void mp_obj_slice_indices(mp_obj_t self_in, mp_int_t length, mp_bound_slice_t *result) { + mp_obj_slice_t *self = MP_OBJ_TO_PTR(self_in); + mp_int_t start, stop, step; + + if (self->step == mp_const_none) { + step = 1; + } else { + step = mp_obj_get_int(self->step); + if (step == 0) { + mp_raise_ValueError(translate("slice step can't be zero")); + } + } + + if (step > 0) { + // Positive step + if (self->start == 
mp_const_none) { + start = 0; + } else { + start = mp_obj_get_int(self->start); + if (start < 0) { + start += length; + } + start = MIN(length, MAX(start, 0)); + } + + if (self->stop == mp_const_none) { + stop = length; + } else { + stop = mp_obj_get_int(self->stop); + if (stop < 0) { + stop += length; + } + stop = MIN(length, MAX(stop, 0)); + } + } else { + // Negative step + if (self->start == mp_const_none) { + start = length - 1; + } else { + start = mp_obj_get_int(self->start); + if (start < 0) { + start += length; + } + start = MIN(length - 1, MAX(start, -1)); + } + + if (self->stop == mp_const_none) { + stop = -1; + } else { + stop = mp_obj_get_int(self->stop); + if (stop < 0) { + stop += length; + } + stop = MIN(length - 1, MAX(stop, -1)); + } + } + + result->start = start; + result->stop = stop; + result->step = step; +} +#endif /* MICROPY_VERSION v1.11 */ + +void ndarray_fill_array_iterable(mp_float_t *array, mp_obj_t iterable) { + mp_obj_iter_buf_t x_buf; + mp_obj_t x_item, x_iterable = mp_getiter(iterable, &x_buf); + while ((x_item = mp_iternext(x_iterable)) != MP_OBJ_STOP_ITERATION) { + *array++ = (mp_float_t)mp_obj_get_float(x_item); + } +} + +#if ULAB_HAS_FUNCTION_ITERATOR +size_t *ndarray_new_coords(uint8_t ndim) { + size_t *coords = m_new(size_t, ndim); + memset(coords, 0, ndim*sizeof(size_t)); + return coords; +} + +void ndarray_rewind_array(uint8_t ndim, uint8_t *array, size_t *shape, int32_t *strides, size_t *coords) { + // resets the data pointer of a single array, whenever an axis is full + // since we always iterate over the very last axis, we have to keep track of + // the last ndim-2 axes only + array -= shape[ULAB_MAX_DIMS - 1] * strides[ULAB_MAX_DIMS - 1]; + array += strides[ULAB_MAX_DIMS - 2]; + for(uint8_t i=1; i < ndim-1; i++) { + coords[ULAB_MAX_DIMS - 1 - i] += 1; + if(coords[ULAB_MAX_DIMS - 1 - i] == shape[ULAB_MAX_DIMS - 1 - i]) { // we are at a dimension boundary + array -= shape[ULAB_MAX_DIMS - 1 - i] * strides[ULAB_MAX_DIMS - 1 
- i]; + array += strides[ULAB_MAX_DIMS - 2 - i]; + coords[ULAB_MAX_DIMS - 1 - i] = 0; + coords[ULAB_MAX_DIMS - 2 - i] += 1; + } else { // coordinates can change only, if the last coordinate changes + return; + } + } +} +#endif + +static int32_t *strides_from_shape(size_t *shape, uint8_t dtype) { + // returns a strides array that corresponds to a dense array with the prescribed shape + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + strides[ULAB_MAX_DIMS-1] = (int32_t)ulab_binary_get_size(dtype); + for(uint8_t i=ULAB_MAX_DIMS; i > 1; i--) { + strides[i-2] = strides[i-1] * shape[i-1]; + } + return strides; +} + +size_t *ndarray_shape_vector(size_t a, size_t b, size_t c, size_t d) { + // returns a ULAB_MAX_DIMS-aware array of shapes + // WARNING: this assumes that the maximum possible dimension is 4! + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + shape[ULAB_MAX_DIMS - 1] = d; + #if ULAB_MAX_DIMS > 1 + shape[ULAB_MAX_DIMS - 2] = c; + #endif + #if ULAB_MAX_DIMS > 2 + shape[ULAB_MAX_DIMS - 3] = b; + #endif + #if ULAB_MAX_DIMS > 3 + shape[ULAB_MAX_DIMS - 4] = a; + #endif + return shape; +} + +bool ndarray_object_is_array_like(mp_obj_t o_in) { + if(mp_obj_is_type(o_in, &ulab_ndarray_type) || + mp_obj_is_type(o_in, &mp_type_tuple) || + mp_obj_is_type(o_in, &mp_type_list) || + mp_obj_is_type(o_in, &mp_type_range)) { + return true; + } + return false; +} + +void fill_array_iterable(mp_float_t *array, mp_obj_t iterable) { + mp_obj_iter_buf_t x_buf; + mp_obj_t x_item, x_iterable = mp_getiter(iterable, &x_buf); + size_t i=0; + while ((x_item = mp_iternext(x_iterable)) != MP_OBJ_STOP_ITERATION) { + array[i] = (mp_float_t)mp_obj_get_float(x_item); + i++; + } +} + +#if NDARRAY_HAS_DTYPE +#if ULAB_HAS_DTYPE_OBJECT +void ndarray_dtype_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { + (void)kind; + dtype_obj_t *self = MP_OBJ_TO_PTR(self_in); + mp_print_str(print, "dtype('"); + if(self->dtype == NDARRAY_BOOLEAN) { + mp_print_str(print, "bool')"); + } 
else if(self->dtype == NDARRAY_UINT8) { + mp_print_str(print, "uint8')"); + } else if(self->dtype == NDARRAY_INT8) { + mp_print_str(print, "int8')"); + } else if(self->dtype == NDARRAY_UINT16) { + mp_print_str(print, "uint16')"); + } else if(self->dtype == NDARRAY_INT16) { + mp_print_str(print, "int16')"); + } + #if ULAB_SUPPORTS_COMPLEX + else if(self->dtype == NDARRAY_COMPLEX) { + mp_print_str(print, "complex')"); + } + #endif + else { + #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT + mp_print_str(print, "float32')"); + #else + mp_print_str(print, "float64')"); + #endif + } +} + +mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) { + (void) type; + mp_arg_check_num(n_args, n_kw, 0, 1, true); + mp_map_t kw_args; + mp_map_init_fixed_table(&kw_args, n_kw, args + n_args); + + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_OBJ, { .u_obj = mp_const_none } }, + }; + mp_arg_val_t _args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, args, &kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, _args); + + dtype_obj_t *dtype = m_new_obj(dtype_obj_t); + dtype->base.type = &ulab_dtype_type; + + if(mp_obj_is_type(args[0], &ulab_ndarray_type)) { + // return the dtype of the array + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0]); + dtype->dtype = ndarray->dtype; + } else { + uint8_t _dtype; + if(mp_obj_is_int(_args[0].u_obj)) { + _dtype = mp_obj_get_int(_args[0].u_obj); + if((_dtype != NDARRAY_BOOL) && (_dtype != NDARRAY_UINT8) + && (_dtype != NDARRAY_INT8) && (_dtype != NDARRAY_UINT16) + && (_dtype != NDARRAY_INT16) && (_dtype != NDARRAY_FLOAT)) { + mp_raise_TypeError(translate("data type not understood")); + } + } else { + GET_STR_DATA_LEN(_args[0].u_obj, _dtype_, len); + if(memcmp(_dtype_, "uint8", 5) == 0) { + _dtype = NDARRAY_UINT8; + } else if(memcmp(_dtype_, "int8", 4) == 0) { + _dtype = NDARRAY_INT8; + } else if(memcmp(_dtype_, "uint16", 6) == 0) { + _dtype = NDARRAY_UINT16; + } else 
if(memcmp(_dtype_, "int16", 5) == 0) { + _dtype = NDARRAY_INT16; + } else if(memcmp(_dtype_, "float", 5) == 0) { + _dtype = NDARRAY_FLOAT; + } + #if ULAB_SUPPORTS_COMPLEX + else if(memcmp(_dtype_, "complex", 7) == 0) { + _dtype = NDARRAY_COMPLEX; + } + #endif + else { + mp_raise_TypeError(translate("data type not understood")); + } + } + dtype->dtype = _dtype; + } + return dtype; +} + +mp_obj_t ndarray_dtype(mp_obj_t self_in) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + dtype_obj_t *dtype = m_new_obj(dtype_obj_t); + dtype->base.type = &ulab_dtype_type; + dtype->dtype = self->dtype; + return dtype; +} + +#else +// this is the cheap implementation of tbe dtype +mp_obj_t ndarray_dtype(mp_obj_t self_in) { + uint8_t dtype; + if(mp_obj_is_type(self_in, &ulab_ndarray_type)) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + dtype = self->dtype; + } else { // we assume here that the input is a single character + GET_STR_DATA_LEN(self_in, _dtype, len); + if((len != 1) || ((*_dtype != NDARRAY_BOOL) && (*_dtype != NDARRAY_UINT8) + && (*_dtype != NDARRAY_INT8) && (*_dtype != NDARRAY_UINT16) + && (*_dtype != NDARRAY_INT16) && (*_dtype != NDARRAY_FLOAT) + #if ULAB_SUPPORTS_COMPLEX + && (*_dtype != NDARRAY_COMPLEX) + #endif + )) { + mp_raise_TypeError(translate("data type not understood")); + } + dtype = *_dtype; + } + return mp_obj_new_int(dtype); +} +#endif /* ULAB_HAS_DTYPE_OBJECT */ +#endif /* NDARRAY_HAS_DTYPE */ + +#if ULAB_HAS_PRINTOPTIONS +mp_obj_t ndarray_set_printoptions(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_threshold, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + { MP_QSTR_edgeitems, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + if(args[0].u_rom_obj != mp_const_none) { + ndarray_print_threshold = 
mp_obj_get_int(args[0].u_rom_obj); + } + if(args[1].u_rom_obj != mp_const_none) { + ndarray_print_edgeitems = mp_obj_get_int(args[1].u_rom_obj); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(ndarray_set_printoptions_obj, 0, ndarray_set_printoptions); + +mp_obj_t ndarray_get_printoptions(void) { + mp_obj_t dict = mp_obj_new_dict(2); + mp_obj_dict_store(MP_OBJ_FROM_PTR(dict), MP_OBJ_NEW_QSTR(MP_QSTR_threshold), mp_obj_new_int(ndarray_print_threshold)); + mp_obj_dict_store(MP_OBJ_FROM_PTR(dict), MP_OBJ_NEW_QSTR(MP_QSTR_edgeitems), mp_obj_new_int(ndarray_print_edgeitems)); + return dict; +} + +MP_DEFINE_CONST_FUN_OBJ_0(ndarray_get_printoptions_obj, ndarray_get_printoptions); +#endif + +mp_obj_t ndarray_get_item(ndarray_obj_t *ndarray, void *array) { + // returns a proper micropython object from an array + if(!ndarray->boolean) { + #if ULAB_SUPPORTS_COMPLEX + if(ndarray->dtype == NDARRAY_COMPLEX) { + mp_float_t *c = (mp_float_t *)array; + mp_float_t real = *c++; + mp_float_t imag = *c; + return mp_obj_new_complex(real, imag); + } + #endif + return mp_binary_get_val_array(ndarray->dtype, array, 0); + } else { + if(*(uint8_t *)array) { + return mp_const_true; + } else { + return mp_const_false; + } + } +} + +static void ndarray_print_element(const mp_print_t *print, ndarray_obj_t *ndarray, uint8_t *array) { + #if ULAB_SUPPORTS_COMPLEX + if(ndarray->dtype == NDARRAY_COMPLEX) { + // real part first + mp_float_t fvalue = *(mp_float_t *)array; + mp_obj_print_helper(print, mp_obj_new_float(fvalue), PRINT_REPR); + // imaginary part + array += ndarray->itemsize / 2; + fvalue = *(mp_float_t *)array; + if(fvalue >= MICROPY_FLOAT_CONST(0.0) || isnan(fvalue)) { + mp_print_str(print, "+"); + } + array += ndarray->itemsize / 2; + mp_obj_print_helper(print, mp_obj_new_float(fvalue), PRINT_REPR); + mp_print_str(print, "j"); + } else { + mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR); + } + #else + mp_obj_print_helper(print, 
ndarray_get_item(ndarray, array), PRINT_REPR); + #endif +} + +static void ndarray_print_row(const mp_print_t *print, ndarray_obj_t * ndarray, uint8_t *array, size_t stride, size_t n) { + if(n == 0) { + return; + } + mp_print_str(print, "["); + if((n <= ndarray_print_threshold) || (n <= 2*ndarray_print_edgeitems)) { // if the array is short, print everything + ndarray_print_element(print, ndarray, array); + array += stride; + for(size_t i=1; i < n; i++, array += stride) { + mp_print_str(print, ", "); + ndarray_print_element(print, ndarray, array); + } + } else { + mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR); + array += stride; + for(size_t i=1; i < ndarray_print_edgeitems; i++, array += stride) { + mp_print_str(print, ", "); + ndarray_print_element(print, ndarray, array); + } + mp_printf(print, ", ..., "); + array += stride * (n - 2 * ndarray_print_edgeitems); + ndarray_print_element(print, ndarray, array); + array += stride; + for(size_t i=1; i < ndarray_print_edgeitems; i++, array += stride) { + mp_print_str(print, ", "); + ndarray_print_element(print, ndarray, array); + } + } + mp_print_str(print, "]"); +} + +#if ULAB_MAX_DIMS > 1 +static void ndarray_print_bracket(const mp_print_t *print, const size_t condition, const size_t shape, const char *string) { + if(condition < shape) { + mp_print_str(print, string); + } +} +#endif + +void ndarray_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { + (void)kind; + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + uint8_t *array = (uint8_t *)self->array; + mp_print_str(print, "array("); + if(self->len == 0) { + mp_print_str(print, "[]"); + if(self->ndim > 1) { + mp_print_str(print, ", shape=("); + #if ULAB_MAX_DIMS > 1 + for(uint8_t ndim = self->ndim; ndim > 1; ndim--) { + mp_printf(MP_PYTHON_PRINTER, "%d,", self->shape[ULAB_MAX_DIMS - ndim]); + } + #else + mp_printf(MP_PYTHON_PRINTER, "%d,", self->shape[0]); + #endif + mp_printf(MP_PYTHON_PRINTER, "%d)", 
self->shape[ULAB_MAX_DIMS - 1]); + } + } else { + #if ULAB_MAX_DIMS > 3 + size_t i=0; + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-4], "["); + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-3], "["); + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-2], "["); + do { + #endif + ndarray_print_row(print, self, array, self->strides[ULAB_MAX_DIMS-1], self->shape[ULAB_MAX_DIMS-1]); + #if ULAB_MAX_DIMS > 1 + array += self->strides[ULAB_MAX_DIMS-2]; + k++; + ndarray_print_bracket(print, k, self->shape[ULAB_MAX_DIMS-2], ",\n "); + } while(k < self->shape[ULAB_MAX_DIMS-2]); + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-2], "]"); + #endif + #if ULAB_MAX_DIMS > 2 + j++; + ndarray_print_bracket(print, j, self->shape[ULAB_MAX_DIMS-3], ",\n\n "); + array -= self->strides[ULAB_MAX_DIMS-2] * self->shape[ULAB_MAX_DIMS-2]; + array += self->strides[ULAB_MAX_DIMS-3]; + } while(j < self->shape[ULAB_MAX_DIMS-3]); + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-3], "]"); + #endif + #if ULAB_MAX_DIMS > 3 + array -= self->strides[ULAB_MAX_DIMS-3] * self->shape[ULAB_MAX_DIMS-3]; + array += self->strides[ULAB_MAX_DIMS-4]; + i++; + ndarray_print_bracket(print, i, self->shape[ULAB_MAX_DIMS-4], ",\n\n "); + } while(i < self->shape[ULAB_MAX_DIMS-4]); + ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-4], "]"); + #endif + } + mp_print_str(print, ", dtype="); + if(self->boolean) { + mp_print_str(print, "bool)"); + } else if(self->dtype == NDARRAY_UINT8) { + mp_print_str(print, "uint8)"); + } else if(self->dtype == NDARRAY_INT8) { + mp_print_str(print, "int8)"); + } else if(self->dtype == NDARRAY_UINT16) { + mp_print_str(print, "uint16)"); + } else if(self->dtype == NDARRAY_INT16) { + mp_print_str(print, "int16)"); + } + #if ULAB_SUPPORTS_COMPLEX + else if(self->dtype == NDARRAY_COMPLEX) { + mp_print_str(print, 
"complex)"); + } + #endif /* ULAB_SUPPORTS_COMPLEX */ + else { + #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT + mp_print_str(print, "float32)"); + #else + mp_print_str(print, "float64)"); + #endif + } +} + +void ndarray_assign_elements(ndarray_obj_t *ndarray, mp_obj_t iterable, uint8_t dtype, size_t *idx) { + // assigns a single row in the tensor + mp_obj_t item; + if(ndarray->boolean) { + uint8_t *array = (uint8_t *)ndarray->array; + array += *idx; + while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + if(mp_obj_is_true(item)) { + *array = 1; + } + array++; + (*idx)++; + } + } else { + while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + #if ULAB_SUPPORTS_COMPLEX + mp_float_t real; + mp_float_t imag; + if(dtype == NDARRAY_COMPLEX) { + mp_obj_get_complex(item, &real, &imag); + ndarray_set_value(NDARRAY_FLOAT, ndarray->array, (*idx)++, mp_obj_new_float(real)); + ndarray_set_value(NDARRAY_FLOAT, ndarray->array, (*idx)++, mp_obj_new_float(imag)); + } else { + ndarray_set_value(dtype, ndarray->array, (*idx)++, item); + } + #else + ndarray_set_value(dtype, ndarray->array, (*idx)++, item); + #endif + } + } +} + +bool ndarray_is_dense(ndarray_obj_t *ndarray) { + // returns true, if the array is dense, false otherwise + // the array should be dense, if the very first stride can be calculated from shape + int32_t stride = ndarray->itemsize; + for(uint8_t i = ULAB_MAX_DIMS - 1; i > ULAB_MAX_DIMS-ndarray->ndim; i--) { + stride *= ndarray->shape[i]; + } + return stride == ndarray->strides[ULAB_MAX_DIMS-ndarray->ndim] ? true : false; +} + + +ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides, uint8_t dtype) { + // Creates the base ndarray with shape, and initialises the values to straight 0s + ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t); + ndarray->base.type = &ulab_ndarray_type; + ndarray->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype; + ndarray->boolean = dtype == NDARRAY_BOOL ? 
NDARRAY_BOOLEAN : NDARRAY_NUMERIC; + ndarray->ndim = ndim; + ndarray->len = ndim == 0 ? 0 : 1; + ndarray->itemsize = ulab_binary_get_size(dtype); + int32_t *_strides; + if(strides == NULL) { + _strides = strides_from_shape(shape, ndarray->dtype); + } else { + _strides = strides; + } + for(uint8_t i=ULAB_MAX_DIMS; i > ULAB_MAX_DIMS-ndim; i--) { + ndarray->shape[i-1] = shape[i-1]; + ndarray->strides[i-1] = _strides[i-1]; + ndarray->len *= shape[i-1]; + } + + // if the length is 0, still allocate a single item, so that contractions can be handled + size_t len = ndarray->itemsize * MAX(1, ndarray->len); + uint8_t *array = m_new(byte, len); + // this should set all elements to 0, irrespective of the of the dtype (all bits are zero) + // we could, perhaps, leave this step out, and initialise the array only, when needed + memset(array, 0, len); + ndarray->array = array; + ndarray->origin = array; + return ndarray; +} + +ndarray_obj_t *ndarray_new_dense_ndarray(uint8_t ndim, size_t *shape, uint8_t dtype) { + // creates a dense array, i.e., one, where the strides are derived directly from the shapes + // the function should work in the general n-dimensional case + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + strides[ULAB_MAX_DIMS-1] = (int32_t)ulab_binary_get_size(dtype); + for(size_t i=ULAB_MAX_DIMS; i > 1; i--) { + strides[i-2] = strides[i-1] * MAX(1, shape[i-1]); + } + return ndarray_new_ndarray(ndim, shape, strides, dtype); +} + +ndarray_obj_t *ndarray_new_ndarray_from_tuple(mp_obj_tuple_t *_shape, uint8_t dtype) { + // creates a dense array from a tuple + // the function should work in the general n-dimensional case + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + for(size_t i=0; i < ULAB_MAX_DIMS; i++) { + if(i < ULAB_MAX_DIMS - _shape->len) { + shape[i] = 0; + } else { + shape[i] = mp_obj_get_int(_shape->items[i]); + } + } + return ndarray_new_dense_ndarray(_shape->len, shape, dtype); +} + +void ndarray_copy_array(ndarray_obj_t *source, ndarray_obj_t *target, 
uint8_t shift) { + // TODO: if the array is dense, the content could be copied in a single pass + // copies the content of source->array into a new dense void pointer + // it is assumed that the dtypes in source and target are the same + // Since the target is a new array, it is supposed to be dense + uint8_t *sarray = (uint8_t *)source->array; + uint8_t *tarray = (uint8_t *)target->array; + #if ULAB_SUPPORTS_COMPLEX + if(source->dtype == NDARRAY_COMPLEX) { + sarray += shift; + } + #endif + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(tarray, sarray, target->itemsize); + tarray += target->itemsize; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif +} + +ndarray_obj_t *ndarray_new_view(ndarray_obj_t *source, uint8_t ndim, size_t *shape, int32_t *strides, int32_t offset) { + // creates a new view from the input arguments + ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t); + ndarray->base.type = &ulab_ndarray_type; + ndarray->boolean = source->boolean; + ndarray->dtype = source->dtype; + ndarray->ndim = ndim; + ndarray->itemsize = source->itemsize; + ndarray->len = ndim == 0 ? 
0 : 1; + for(uint8_t i=ULAB_MAX_DIMS; i > ULAB_MAX_DIMS-ndim; i--) { + ndarray->shape[i-1] = shape[i-1]; + ndarray->strides[i-1] = strides[i-1]; + ndarray->len *= shape[i-1]; + } + uint8_t *pointer = (uint8_t *)source->array; + pointer += offset; + ndarray->array = pointer; + ndarray->origin = source->origin; + return ndarray; +} + +ndarray_obj_t *ndarray_copy_view(ndarray_obj_t *source) { + // creates a one-to-one deep copy of the input ndarray or its view + // the function should work in the general n-dimensional case + // In order to make it dtype-agnostic, we copy the memory content + // instead of reading out the values + + int32_t *strides = strides_from_shape(source->shape, source->dtype); + + uint8_t dtype = source->dtype; + if(source->boolean) { + dtype = NDARRAY_BOOLEAN; + } + ndarray_obj_t *ndarray = ndarray_new_ndarray(source->ndim, source->shape, strides, dtype); + ndarray_copy_array(source, ndarray, 0); + return ndarray; +} + +ndarray_obj_t *ndarray_copy_view_convert_type(ndarray_obj_t *source, uint8_t dtype) { + // creates a copy, similar to ndarray_copy_view, but it also converts the dtype, if necessary + if(dtype == source->dtype) { + return ndarray_copy_view(source); + } + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, dtype); + uint8_t *sarray = (uint8_t *)source->array; + uint8_t *array = (uint8_t *)ndarray->array; + + #if ULAB_SUPPORTS_COMPLEX + uint8_t complex_size = 2 * sizeof(mp_float_t); + #endif + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_obj_t item; + #if ULAB_SUPPORTS_COMPLEX + if(source->dtype == NDARRAY_COMPLEX) { + if(dtype != NDARRAY_COMPLEX) { + mp_raise_TypeError(translate("cannot convert complex type")); + } else { + memcpy(array, sarray, complex_size); + } + } else { + #endif + if((source->dtype == NDARRAY_FLOAT) && (dtype != NDARRAY_FLOAT)) { + 
// floats must be treated separately, because they can't directly be converted to integer types + mp_float_t f = ndarray_get_float_value(sarray, source->dtype); + item = mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(floor)(f)); + } else { + item = mp_binary_get_val_array(source->dtype, sarray, 0); + } + #if ULAB_SUPPORTS_COMPLEX + if(dtype == NDARRAY_COMPLEX) { + ndarray_set_value(NDARRAY_FLOAT, array, 0, item); + } else { + ndarray_set_value(dtype, array, 0, item); + } + } + #else + ndarray_set_value(dtype, array, 0, item); + #endif + array += ndarray->itemsize; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif + return MP_OBJ_FROM_PTR(ndarray); +} + +#if NDARRAY_HAS_BYTESWAP +mp_obj_t ndarray_byteswap(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // changes the endiannes of an array + // if the dtype of the input uint8/int8/bool, simply return a copy or view + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_inplace, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_false } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + ndarray_obj_t *self = 
MP_OBJ_TO_PTR(args[0].u_obj); + ndarray_obj_t *ndarray = NULL; + if(args[1].u_obj == mp_const_false) { + ndarray = ndarray_copy_view(self); + } else { + ndarray = ndarray_new_view(self, self->ndim, self->shape, self->strides, 0); + } + if((self->dtype == NDARRAY_BOOL) || (self->dtype == NDARRAY_UINT8) || (self->dtype == NDARRAY_INT8)) { + return MP_OBJ_FROM_PTR(ndarray); + } else { + uint8_t *array = (uint8_t *)ndarray->array; + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + if(self->dtype == NDARRAY_FLOAT) { + #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT + SWAP(uint8_t, array[0], array[3]); + SWAP(uint8_t, array[1], array[2]); + #else + SWAP(uint8_t, array[0], array[7]); + SWAP(uint8_t, array[1], array[6]); + SWAP(uint8_t, array[2], array[5]); + SWAP(uint8_t, array[3], array[4]); + #endif + } else { + SWAP(uint8_t, array[0], array[1]); + } + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1]; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2]; + array += ndarray->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < ndarray->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3]; + array += ndarray->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < ndarray->shape[ULAB_MAX_DIMS - 4]); + #endif + } + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(ndarray_byteswap_obj, 1, ndarray_byteswap); +#endif + +#if NDARRAY_HAS_COPY +mp_obj_t ndarray_copy(mp_obj_t self_in) { + ndarray_obj_t *self = 
MP_OBJ_TO_PTR(self_in); + return MP_OBJ_FROM_PTR(ndarray_copy_view(self)); +} + +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_copy_obj, ndarray_copy); +#endif + +ndarray_obj_t *ndarray_new_linear_array(size_t len, uint8_t dtype) { + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + if(len == 0) { + return ndarray_new_dense_ndarray(0, shape, dtype); + } + shape[ULAB_MAX_DIMS-1] = len; + return ndarray_new_dense_ndarray(1, shape, dtype); +} + +ndarray_obj_t *ndarray_from_iterable(mp_obj_t obj, uint8_t dtype) { + // returns an ndarray from an iterable micropython object + // if the input is an ndarray, returns the input... + if(mp_obj_is_type(obj, &ulab_ndarray_type)) { + return MP_OBJ_TO_PTR(obj); + } + // ... otherwise, takes the values from the iterable, and creates the corresponding ndarray + + // First, we have to figure out, whether the elements of the iterable are iterables themself + uint8_t ndim = 0; + size_t shape[ULAB_MAX_DIMS]; + mp_obj_iter_buf_t iter_buf[ULAB_MAX_DIMS]; + mp_obj_t iterable[ULAB_MAX_DIMS]; + // inspect only the very first element in each dimension; this is fast, + // but not completely safe, e.g., length compatibility is not checked + mp_obj_t item = obj; + + while(1) { + if(mp_obj_len_maybe(item) == MP_OBJ_NULL) { + break; + } + if(ndim == ULAB_MAX_DIMS) { + mp_raise_ValueError(translate("too many dimensions")); + } + shape[ndim] = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(item)); + if(shape[ndim] == 0) { + ndim++; + break; + } + iterable[ndim] = mp_getiter(item, &iter_buf[ndim]); + item = mp_iternext(iterable[ndim]); + ndim++; + } + for(uint8_t i = 0; i < ndim; i++) { + // align all values to the right + shape[ULAB_MAX_DIMS - i - 1] = shape[ndim - 1 - i]; + } + + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(ndim, shape, dtype); + item = obj; + for(uint8_t i = 0; i < ndim - 1; i++) { + // if ndim > 1, descend into the hierarchy + iterable[ULAB_MAX_DIMS - ndim + i] = mp_getiter(item, &iter_buf[ULAB_MAX_DIMS - ndim + i]); + item = 
mp_iternext(iterable[ULAB_MAX_DIMS - ndim + i]); + } + + size_t idx = 0; + // TODO: this could surely be done in a more elegant way... + #if ULAB_MAX_DIMS > 3 + do { + #endif + #if ULAB_MAX_DIMS > 2 + do { + #endif + #if ULAB_MAX_DIMS > 1 + do { + #endif + iterable[ULAB_MAX_DIMS - 1] = mp_getiter(item, &iter_buf[ULAB_MAX_DIMS - 1]); + ndarray_assign_elements(ndarray, iterable[ULAB_MAX_DIMS - 1], ndarray->dtype, &idx); + #if ULAB_MAX_DIMS > 1 + item = ndim > 1 ? mp_iternext(iterable[ULAB_MAX_DIMS - 2]) : MP_OBJ_STOP_ITERATION; + } while(item != MP_OBJ_STOP_ITERATION); + #endif + #if ULAB_MAX_DIMS > 2 + item = ndim > 2 ? mp_iternext(iterable[ULAB_MAX_DIMS - 3]) : MP_OBJ_STOP_ITERATION; + if(item != MP_OBJ_STOP_ITERATION) { + iterable[ULAB_MAX_DIMS - 2] = mp_getiter(item, &iter_buf[ULAB_MAX_DIMS - 2]); + item = mp_iternext(iterable[ULAB_MAX_DIMS - 2]); + } else { + iterable[ULAB_MAX_DIMS - 2] = MP_OBJ_STOP_ITERATION; + } + } while(iterable[ULAB_MAX_DIMS - 2] != MP_OBJ_STOP_ITERATION); + #endif + #if ULAB_MAX_DIMS > 3 + item = ndim > 3 ? 
mp_iternext(iterable[ULAB_MAX_DIMS - 4]) : MP_OBJ_STOP_ITERATION; + if(item != MP_OBJ_STOP_ITERATION) { + iterable[ULAB_MAX_DIMS - 3] = mp_getiter(item, &iter_buf[ULAB_MAX_DIMS - 3]); + item = mp_iternext(iterable[ULAB_MAX_DIMS - 3]); + } else { + iterable[ULAB_MAX_DIMS - 3] = MP_OBJ_STOP_ITERATION; + } + } while(iterable[ULAB_MAX_DIMS - 3] != MP_OBJ_STOP_ITERATION); + #endif + + return ndarray; +} + +STATIC uint8_t ndarray_init_helper(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_obj = MP_ROM_INT(NDARRAY_FLOAT) } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + uint8_t _dtype; + #if ULAB_HAS_DTYPE_OBJECT + if(mp_obj_is_type(args[1].u_obj, &ulab_dtype_type)) { + dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[1].u_obj); + _dtype = dtype->dtype; + } else { // this must be an integer defined as a class constant (ulba.uint8 etc.) + _dtype = mp_obj_get_int(args[1].u_obj); + } + #else + _dtype = mp_obj_get_int(args[1].u_obj); + #endif + return _dtype; +} + +STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args, mp_map_t *kw_args) { + uint8_t dtype = ndarray_init_helper(n_args, args, kw_args); + + if(mp_obj_is_type(args[0], &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]); + return MP_OBJ_FROM_PTR(ndarray_copy_view_convert_type(source, dtype)); + } else { + // assume that the input is an iterable + return MP_OBJ_FROM_PTR(ndarray_from_iterable(args[0], dtype)); + } +} + +mp_obj_t ndarray_array_constructor(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // array constructor for ndarray, equivalent to numpy.array(...) 
+ return ndarray_make_new_core(&ulab_ndarray_type, n_args, kw_args->used, pos_args, kw_args); +} +MP_DEFINE_CONST_FUN_OBJ_KW(ndarray_array_constructor_obj, 1, ndarray_array_constructor); + +mp_obj_t ndarray_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) { + (void) type; + mp_arg_check_num(n_args, n_kw, 1, 2, true); + mp_map_t kw_args; + mp_map_init_fixed_table(&kw_args, n_kw, args + n_args); + return ndarray_make_new_core(type, n_args, n_kw, args, &kw_args); +} + +// broadcasting is used at a number of places, always include +bool ndarray_can_broadcast(ndarray_obj_t *lhs, ndarray_obj_t *rhs, uint8_t *ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + // Returns true or false, depending on, whether the two arrays can be broadcast together + // with numpy's broadcasting rules. These are as follows: + // + // 1. the two shapes are either equal + // 2. one of the shapes is 1 + memset(lstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS); + memset(rstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS); + lstrides[ULAB_MAX_DIMS - 1] = lhs->strides[ULAB_MAX_DIMS - 1]; + rstrides[ULAB_MAX_DIMS - 1] = rhs->strides[ULAB_MAX_DIMS - 1]; + for(uint8_t i=ULAB_MAX_DIMS; i > 0; i--) { + if((lhs->shape[i-1] == rhs->shape[i-1]) || (lhs->shape[i-1] == 0) || (lhs->shape[i-1] == 1) || + (rhs->shape[i-1] == 0) || (rhs->shape[i-1] == 1)) { + shape[i-1] = MAX(lhs->shape[i-1], rhs->shape[i-1]); + if(shape[i-1] > 0) (*ndim)++; + if(lhs->shape[i-1] < 2) { + lstrides[i-1] = 0; + } else { + lstrides[i-1] = lhs->strides[i-1]; + } + if(rhs->shape[i-1] < 2) { + rstrides[i-1] = 0; + } else { + rstrides[i-1] = rhs->strides[i-1]; + } + } else { + return false; + } + } + return true; +} + +#if NDARRAY_HAS_INPLACE_OPS +bool ndarray_can_broadcast_inplace(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides) { + // returns true or false, depending on, whether the two arrays can be broadcast together inplace + // this means that the right hand side always must be 
"smaller" than the left hand side, i.e. + // the broadcasting rules are as follows: + // + // 1. the two shapes are either equal + // 2. the shapes on the right hand side is 1 + memset(rstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS); + rstrides[ULAB_MAX_DIMS - 1] = rhs->strides[ULAB_MAX_DIMS - 1]; + for(uint8_t i=ULAB_MAX_DIMS; i > 0; i--) { + if((lhs->shape[i-1] == rhs->shape[i-1]) || (rhs->shape[i-1] == 0) || (rhs->shape[i-1] == 1)) { + if(rhs->shape[i-1] < 2) { + rstrides[i-1] = 0; + } else { + rstrides[i-1] = rhs->strides[i-1]; + } + } else { + return false; + } + } + return true; +} +#endif + +#if NDARRAY_IS_SLICEABLE +static size_t slice_length(mp_bound_slice_t slice) { + ssize_t len, correction = 1; + if(slice.step > 0) correction = -1; + len = (ssize_t)(slice.stop - slice.start + (slice.step + correction)) / slice.step; + if(len < 0) return 0; + return (size_t)len; +} + +static mp_bound_slice_t generate_slice(mp_int_t n, mp_obj_t index) { + mp_bound_slice_t slice; + if(mp_obj_is_type(index, &mp_type_slice)) { + mp_obj_slice_indices(index, n, &slice); + } else if(mp_obj_is_int(index)) { + mp_int_t _index = mp_obj_get_int(index); + if(_index < 0) { + _index += n; + } + if((_index >= n) || (_index < 0)) { + mp_raise_msg(&mp_type_IndexError, translate("index is out of bounds")); + } + slice.start = _index; + slice.stop = _index + 1; + slice.step = 1; + } else { + mp_raise_msg(&mp_type_IndexError, translate("indices must be integers, slices, or Boolean lists")); + } + return slice; +} + +static ndarray_obj_t *ndarray_view_from_slices(ndarray_obj_t *ndarray, mp_obj_tuple_t *tuple) { + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(size_t)*ULAB_MAX_DIMS); + + uint8_t ndim = ndarray->ndim; + + for(uint8_t i=0; i < ndim; i++) { + // copy from the end + shape[ULAB_MAX_DIMS - 1 - i] = ndarray->shape[ULAB_MAX_DIMS - 1 - i]; + strides[ULAB_MAX_DIMS 
- 1 - i] = ndarray->strides[ULAB_MAX_DIMS - 1 - i]; + } + int32_t offset = 0; + for(uint8_t i=0; i < tuple->len; i++) { + if(mp_obj_is_int(tuple->items[i])) { + // if item is an int, the dimension will first be reduced ... + ndim--; + int32_t k = mp_obj_get_int(tuple->items[i]); + if(k < 0) { + k += ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + i]; + } + if((k >= (int32_t)ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + i]) || (k < 0)) { + mp_raise_msg(&mp_type_IndexError, translate("index is out of bounds")); + } + offset += ndarray->strides[ULAB_MAX_DIMS - ndarray->ndim + i] * k; + // ... and then we have to shift the shapes to the right + for(uint8_t j=0; j < i; j++) { + shape[ULAB_MAX_DIMS - ndarray->ndim + i - j] = shape[ULAB_MAX_DIMS - ndarray->ndim + i - j - 1]; + strides[ULAB_MAX_DIMS - ndarray->ndim + i - j] = strides[ULAB_MAX_DIMS - ndarray->ndim + i - j - 1]; + } + } else { + mp_bound_slice_t slice = generate_slice(shape[ULAB_MAX_DIMS - ndarray->ndim + i], tuple->items[i]); + shape[ULAB_MAX_DIMS - ndarray->ndim + i] = slice_length(slice); + offset += ndarray->strides[ULAB_MAX_DIMS - ndarray->ndim + i] * (int32_t)slice.start; + strides[ULAB_MAX_DIMS - ndarray->ndim + i] = (int32_t)slice.step * ndarray->strides[ULAB_MAX_DIMS - ndarray->ndim + i]; + } + } + return ndarray_new_view(ndarray, ndim, shape, strides, offset); +} + +void ndarray_assign_view(ndarray_obj_t *view, ndarray_obj_t *values) { + if(values->len == 0) { + return; + } + uint8_t ndim = 0; + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS); + int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS); + if(!ndarray_can_broadcast(view, values, &ndim, shape, lstrides, rstrides)) { + mp_raise_ValueError(translate("operands could not be broadcast together")); + m_del(size_t, shape, ULAB_MAX_DIMS); + m_del(int32_t, lstrides, ULAB_MAX_DIMS); + m_del(int32_t, rstrides, ULAB_MAX_DIMS); + } + + uint8_t *rarray = (uint8_t *)values->array; + + #if 
ULAB_SUPPORTS_COMPLEX + if(values->dtype == NDARRAY_COMPLEX) { + if(view->dtype != NDARRAY_COMPLEX) { + mp_raise_TypeError(translate("cannot convert complex to dtype")); + } else { + uint8_t *larray = (uint8_t *)view->array; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(larray, rarray, view->itemsize); + larray += lstrides[ULAB_MAX_DIMS - 1]; + rarray += rstrides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < view->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + larray -= lstrides[ULAB_MAX_DIMS - 1] * view->shape[ULAB_MAX_DIMS-1]; + larray += lstrides[ULAB_MAX_DIMS - 2]; + rarray -= rstrides[ULAB_MAX_DIMS - 1] * view->shape[ULAB_MAX_DIMS-1]; + rarray += rstrides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < view->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + larray -= lstrides[ULAB_MAX_DIMS - 2] * view->shape[ULAB_MAX_DIMS-2]; + larray += lstrides[ULAB_MAX_DIMS - 3]; + rarray -= rstrides[ULAB_MAX_DIMS - 2] * view->shape[ULAB_MAX_DIMS-2]; + rarray += rstrides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < view->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + larray -= lstrides[ULAB_MAX_DIMS - 3] * view->shape[ULAB_MAX_DIMS-3]; + larray += lstrides[ULAB_MAX_DIMS - 4]; + rarray -= rstrides[ULAB_MAX_DIMS - 3] * view->shape[ULAB_MAX_DIMS-3]; + rarray += rstrides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < view->shape[ULAB_MAX_DIMS - 4]); + #endif + } + return; + } + #endif + + // since in ASSIGNMENT_LOOP the array has a type, we have to divide the strides by the itemsize + for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) { + lstrides[i] /= view->itemsize; + #if ULAB_SUPPORTS_COMPLEX + if(view->dtype == NDARRAY_COMPLEX) { + lstrides[i] *= 2; + } + #endif + } + + if(view->dtype == NDARRAY_UINT8) { + if(values->dtype == NDARRAY_UINT8) { + ASSIGNMENT_LOOP(view, uint8_t, uint8_t, lstrides, rarray, rstrides); + } 
else if(values->dtype == NDARRAY_INT8) { + ASSIGNMENT_LOOP(view, uint8_t, int8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_UINT16) { + ASSIGNMENT_LOOP(view, uint8_t, uint16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT16) { + ASSIGNMENT_LOOP(view, uint8_t, int16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_FLOAT) { + ASSIGNMENT_LOOP(view, uint8_t, mp_float_t, lstrides, rarray, rstrides); + } + } else if(view->dtype == NDARRAY_INT8) { + if(values->dtype == NDARRAY_UINT8) { + ASSIGNMENT_LOOP(view, int8_t, uint8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT8) { + ASSIGNMENT_LOOP(view, int8_t, int8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_UINT16) { + ASSIGNMENT_LOOP(view, int8_t, uint16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT16) { + ASSIGNMENT_LOOP(view, int8_t, int16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_FLOAT) { + ASSIGNMENT_LOOP(view, int8_t, mp_float_t, lstrides, rarray, rstrides); + } + } else if(view->dtype == NDARRAY_UINT16) { + if(values->dtype == NDARRAY_UINT8) { + ASSIGNMENT_LOOP(view, uint16_t, uint8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT8) { + ASSIGNMENT_LOOP(view, uint16_t, int8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_UINT16) { + ASSIGNMENT_LOOP(view, uint16_t, uint16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT16) { + ASSIGNMENT_LOOP(view, uint16_t, int16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_FLOAT) { + ASSIGNMENT_LOOP(view, uint16_t, mp_float_t, lstrides, rarray, rstrides); + } + } else if(view->dtype == NDARRAY_INT16) { + if(values->dtype == NDARRAY_UINT8) { + ASSIGNMENT_LOOP(view, int16_t, uint8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT8) { + ASSIGNMENT_LOOP(view, int16_t, int8_t, lstrides, rarray, rstrides); + 
} else if(values->dtype == NDARRAY_UINT16) { + ASSIGNMENT_LOOP(view, int16_t, uint16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT16) { + ASSIGNMENT_LOOP(view, int16_t, int16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_FLOAT) { + ASSIGNMENT_LOOP(view, int16_t, mp_float_t, lstrides, rarray, rstrides); + } + } else { // the dtype must be an mp_float_t or complex now + if(values->dtype == NDARRAY_UINT8) { + ASSIGNMENT_LOOP(view, mp_float_t, uint8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT8) { + ASSIGNMENT_LOOP(view, mp_float_t, int8_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_UINT16) { + ASSIGNMENT_LOOP(view, mp_float_t, uint16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_INT16) { + ASSIGNMENT_LOOP(view, mp_float_t, int16_t, lstrides, rarray, rstrides); + } else if(values->dtype == NDARRAY_FLOAT) { + ASSIGNMENT_LOOP(view, mp_float_t, mp_float_t, lstrides, rarray, rstrides); + } + } +} + +static mp_obj_t ndarray_from_boolean_index(ndarray_obj_t *ndarray, ndarray_obj_t *index) { + // returns a 1D array, indexed by a Boolean array + if(ndarray->len != index->len) { + mp_raise_ValueError(translate("array and index length must be equal")); + } + uint8_t *iarray = (uint8_t *)index->array; + // first we have to find out how many trues there are + size_t count = 0; + for(size_t i=0; i < index->len; i++) { + count += *iarray; + iarray += index->strides[ULAB_MAX_DIMS - 1]; + } + ndarray_obj_t *results = ndarray_new_linear_array(count, ndarray->dtype); + uint8_t *rarray = (uint8_t *)results->array; + uint8_t *array = (uint8_t *)ndarray->array; + // re-wind the index array + iarray = index->array; + for(size_t i=0; i < index->len; i++) { + if(*iarray) { + memcpy(rarray, array, results->itemsize); + rarray += results->itemsize; + count++; + } + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + iarray += index->strides[ULAB_MAX_DIMS - 1]; + } + return 
MP_OBJ_FROM_PTR(results); +} + +static mp_obj_t ndarray_assign_from_boolean_index(ndarray_obj_t *ndarray, ndarray_obj_t *index, ndarray_obj_t *values) { + // assigns values to a Boolean-indexed array + // first we have to find out how many trues there are + uint8_t *iarray = (uint8_t *)index->array; + size_t istride = index->strides[ULAB_MAX_DIMS - 1]; + size_t count = 0; + for(size_t i=0; i < index->len; i++) { + count += *iarray; + iarray += istride; + } + // re-wind the index array + iarray = index->array; + uint8_t *varray = (uint8_t *)values->array; + size_t vstride; + + if(count == values->len) { + // there are as many values as true indices + vstride = values->strides[ULAB_MAX_DIMS - 1]; + } else { + // there is a single value + vstride = 0; + } + + #if ULAB_SUPPORTS_COMPLEX + if(values->dtype == NDARRAY_COMPLEX) { + if(ndarray->dtype != NDARRAY_COMPLEX) { + mp_raise_TypeError(translate("cannot convert complex to dtype")); + } else { + uint8_t *array = (uint8_t *)ndarray->array; + for(size_t i = 0; i < ndarray->len; i++) { + if(*iarray) { + memcpy(array, varray, ndarray->itemsize); + varray += vstride; + } + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + iarray += istride; + } while(0); + return MP_OBJ_FROM_PTR(ndarray); + } + } + #endif + + int32_t lstrides = ndarray->strides[ULAB_MAX_DIMS - 1] / ndarray->itemsize; + + if(ndarray->dtype == NDARRAY_UINT8) { + if(values->dtype == NDARRAY_UINT8) { + BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT8) { + BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_UINT16) { + BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT16) { + BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == 
NDARRAY_FLOAT) { + BOOLEAN_ASSIGNMENT_LOOP(uint8_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride); + } + } else if(ndarray->dtype == NDARRAY_INT8) { + if(values->dtype == NDARRAY_UINT8) { + BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT8) { + BOOLEAN_ASSIGNMENT_LOOP(int8_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_UINT16) { + BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT16) { + BOOLEAN_ASSIGNMENT_LOOP(int8_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_FLOAT) { + BOOLEAN_ASSIGNMENT_LOOP(int8_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride); + } + } else if(ndarray->dtype == NDARRAY_UINT16) { + if(values->dtype == NDARRAY_UINT8) { + BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT8) { + BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_UINT16) { + BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT16) { + BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_FLOAT) { + BOOLEAN_ASSIGNMENT_LOOP(uint16_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride); + } + } else if(ndarray->dtype == NDARRAY_INT16) { + if(values->dtype == NDARRAY_UINT8) { + BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT8) { + BOOLEAN_ASSIGNMENT_LOOP(int16_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else 
if(values->dtype == NDARRAY_UINT16) { + BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT16) { + BOOLEAN_ASSIGNMENT_LOOP(int16_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_FLOAT) { + BOOLEAN_ASSIGNMENT_LOOP(int16_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride); + } + } else { + #if ULAB_SUPPORTS_COMPLEX + if(ndarray->dtype == NDARRAY_COMPLEX) { + lstrides *= 2; + } + #endif + if(values->dtype == NDARRAY_UINT8) { + BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT8) { + BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_UINT16) { + BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_INT16) { + BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride); + } else if(values->dtype == NDARRAY_FLOAT) { + BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride); + } + } + return MP_OBJ_FROM_PTR(ndarray); +} + +static mp_obj_t ndarray_get_slice(ndarray_obj_t *ndarray, mp_obj_t index, ndarray_obj_t *values) { + if(mp_obj_is_type(index, &ulab_ndarray_type)) { + ndarray_obj_t *nindex = MP_OBJ_TO_PTR(index); + if((nindex->ndim > 1) || (nindex->boolean == false)) { + mp_raise_NotImplementedError(translate("operation is implemented for 1D Boolean arrays only")); + } + if(values == NULL) { // return value(s) + return ndarray_from_boolean_index(ndarray, nindex); + } else { // assign value(s) + ndarray_assign_from_boolean_index(ndarray, nindex, values); + } + } + if(mp_obj_is_type(index, &mp_type_tuple) || mp_obj_is_int(index) || mp_obj_is_type(index, &mp_type_slice)) { + 
mp_obj_tuple_t *tuple; + if(mp_obj_is_type(index, &mp_type_tuple)) { + tuple = MP_OBJ_TO_PTR(index); + if(tuple->len > ndarray->ndim) { + mp_raise_msg(&mp_type_IndexError, translate("too many indices")); + } + } else { + mp_obj_t *items = m_new(mp_obj_t, 1); + items[0] = index; + tuple = mp_obj_new_tuple(1, items); + } + ndarray_obj_t *view = ndarray_view_from_slices(ndarray, tuple); + if(values == NULL) { // return value(s) + // if the view has been reduced to nothing, return a single value + if(view->ndim == 0) { + return ndarray_get_item(view, view->array); + } else { + return MP_OBJ_FROM_PTR(view); + } + } else { // assign value(s) + ndarray_assign_view(view, values); + } + } + return mp_const_none; +} + +mp_obj_t ndarray_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + + if (value == MP_OBJ_SENTINEL) { // return value(s) + return ndarray_get_slice(self, index, NULL); + } else { // assignment to slices; the value must be an ndarray, or a scalar + ndarray_obj_t *values = ndarray_from_mp_obj(value, 0); + return ndarray_get_slice(self, index, values); + } + return mp_const_none; +} +#endif /* NDARRAY_IS_SLICEABLE */ + +#if NDARRAY_IS_ITERABLE + +// itarray iterator +mp_obj_t ndarray_getiter(mp_obj_t o_in, mp_obj_iter_buf_t *iter_buf) { + return ndarray_new_ndarray_iterator(o_in, iter_buf); +} + +typedef struct _mp_obj_ndarray_it_t { + mp_obj_base_t base; + mp_fun_1_t iternext; + mp_obj_t ndarray; + size_t cur; +} mp_obj_ndarray_it_t; + +mp_obj_t ndarray_iternext(mp_obj_t self_in) { + mp_obj_ndarray_it_t *self = MP_OBJ_TO_PTR(self_in); + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(self->ndarray); + uint8_t *array = (uint8_t *)ndarray->array; + + size_t iter_end = ndarray->shape[ULAB_MAX_DIMS-ndarray->ndim]; + if(self->cur < iter_end) { + // separating this case out saves 50 bytes for 1D arrays + #if ULAB_MAX_DIMS == 1 + array += self->cur * ndarray->strides[0]; + self->cur++; + return ndarray_get_item(ndarray, 
array); + #else + if(ndarray->ndim == 1) { // we have a linear array + array += self->cur * ndarray->strides[ULAB_MAX_DIMS - 1]; + self->cur++; + return ndarray_get_item(ndarray, array); + } else { // we have a tensor, return the reduced view + size_t offset = self->cur * ndarray->strides[ULAB_MAX_DIMS - ndarray->ndim]; + self->cur++; + return MP_OBJ_FROM_PTR(ndarray_new_view(ndarray, ndarray->ndim-1, ndarray->shape, ndarray->strides, offset)); + } + #endif + } else { + return MP_OBJ_STOP_ITERATION; + } +} + +mp_obj_t ndarray_new_ndarray_iterator(mp_obj_t ndarray, mp_obj_iter_buf_t *iter_buf) { + assert(sizeof(mp_obj_ndarray_it_t) <= sizeof(mp_obj_iter_buf_t)); + mp_obj_ndarray_it_t *iter = (mp_obj_ndarray_it_t *)iter_buf; + iter->base.type = &mp_type_polymorph_iter; + iter->iternext = ndarray_iternext; + iter->ndarray = ndarray; + iter->cur = 0; + return MP_OBJ_FROM_PTR(iter); +} +#endif /* NDARRAY_IS_ITERABLE */ + +#if NDARRAY_HAS_FLATTEN +mp_obj_t ndarray_flatten(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_order, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_QSTR(MP_QSTR_C)} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + ndarray_obj_t *self = MP_OBJ_TO_PTR(pos_args[0]); + GET_STR_DATA_LEN(args[0].u_obj, order, len); + if((len != 1) || ((memcmp(order, "C", 1) != 0) && (memcmp(order, "F", 1) != 0))) { + mp_raise_ValueError(translate("flattening order must be either 'C', or 'F'")); + } + + uint8_t *sarray = (uint8_t *)self->array; + ndarray_obj_t *ndarray = ndarray_new_linear_array(self->len, self->dtype); + uint8_t *array = (uint8_t *)ndarray->array; + + if(memcmp(order, "C", 1) == 0) { // C-type ordering + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + 
size_t l = 0; + do { + memcpy(array, sarray, self->itemsize); + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + sarray += self->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < self->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= self->strides[ULAB_MAX_DIMS - 1] * self->shape[ULAB_MAX_DIMS-1]; + sarray += self->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < self->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= self->strides[ULAB_MAX_DIMS - 2] * self->shape[ULAB_MAX_DIMS-2]; + sarray += self->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < self->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= self->strides[ULAB_MAX_DIMS - 3] * self->shape[ULAB_MAX_DIMS-3]; + sarray += self->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < self->shape[ULAB_MAX_DIMS - 4]); + #endif + } else { // 'F', Fortran-type ordering + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(array, sarray, self->itemsize); + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + sarray += self->strides[0]; + l++; + } while(l < self->shape[0]); + #if ULAB_MAX_DIMS > 1 + sarray -= self->strides[0] * self->shape[0]; + sarray += self->strides[1]; + k++; + } while(k < self->shape[1]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= self->strides[1] * self->shape[1]; + sarray += self->strides[2]; + j++; + } while(j < self->shape[2]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= self->strides[2] * self->shape[2]; + sarray += self->strides[3]; + i++; + } while(i < self->shape[3]); + #endif + } + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(ndarray_flatten_obj, 1, ndarray_flatten); +#endif + +#if NDARRAY_HAS_ITEMSIZE +mp_obj_t ndarray_itemsize(mp_obj_t self_in) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + return MP_OBJ_NEW_SMALL_INT(self->itemsize); +} +#endif + +#if 
NDARRAY_HAS_SHAPE
+// .shape property: returns the array shape as a tuple.
+// A 0-dimensional array still reports a 1-tuple (nitems is clamped to 1).
+mp_obj_t ndarray_shape(mp_obj_t self_in) {
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    uint8_t nitems = MAX(1, self->ndim);
+    mp_obj_t *items = m_new(mp_obj_t, nitems);
+    // shape[] is right-aligned in a fixed ULAB_MAX_DIMS-long array, so walk
+    // it from the tail and fill the tuple back-to-front
+    for(uint8_t i = 0; i < nitems; i++) {
+        items[nitems - i - 1] = mp_obj_new_int(self->shape[ULAB_MAX_DIMS - i - 1]);
+    }
+    mp_obj_t tuple = mp_obj_new_tuple(nitems, items);
+    m_del(mp_obj_t, items, nitems);
+    return tuple;
+}
+#endif
+
+#if NDARRAY_HAS_SIZE
+// .size property: total number of elements in the array.
+mp_obj_t ndarray_size(mp_obj_t self_in) {
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    return mp_obj_new_int(self->len);
+}
+#endif
+
+#if NDARRAY_HAS_STRIDES
+// .strides property: byte strides as a tuple, one entry per used dimension.
+// Note: unlike ndarray_shape above, a 0-dim array yields an empty tuple here.
+mp_obj_t ndarray_strides(mp_obj_t self_in) {
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_obj_t *items = m_new(mp_obj_t, self->ndim);
+    // strides[] is right-aligned; the first used entry sits at
+    // ULAB_MAX_DIMS - ndim
+    for(int8_t i=0; i < self->ndim; i++) {
+        items[i] = mp_obj_new_int(self->strides[ULAB_MAX_DIMS - self->ndim + i]);
+    }
+    mp_obj_t tuple = mp_obj_new_tuple(self->ndim, items);
+    m_del(mp_obj_t, items, self->ndim);
+    return tuple;
+}
+#endif
+
+#if NDARRAY_HAS_TOBYTES
+mp_obj_t ndarray_tobytes(mp_obj_t self_in) {
+    // As opposed to numpy, this function returns a bytearray object with the data pointer (i.e., not a copy)
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    // Piping into a bytearray makes sense for dense arrays only,
+    // so bail out, if that is not the case
+    if(!ndarray_is_dense(self)) {
+        mp_raise_ValueError(translate("tobytes can be invoked for dense arrays only"));
+    }
+    // NOTE(review): the returned bytearray aliases self->array, so mutating
+    // it mutates the ndarray in place
+    return mp_obj_new_bytearray_by_ref(self->itemsize * self->len, self->array);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(ndarray_tobytes_obj, ndarray_tobytes);
+#endif
+
+#if NDARRAY_HAS_TOLIST
+// Recursive helper for tolist(): converts dimension `dim` (counted from the
+// innermost axis, dim == 1) into a Python list, recursing for outer axes.
+static mp_obj_t ndarray_recursive_list(ndarray_obj_t *self, uint8_t *array, uint8_t dim) {
+    int32_t stride = self->strides[ULAB_MAX_DIMS - dim];
+    size_t len = self->shape[ULAB_MAX_DIMS - dim];
+
+    mp_obj_list_t *list = MP_OBJ_TO_PTR(mp_obj_new_list(len, NULL));
+    for(size_t i = 0; i < len; i++) {
+        if(dim == 1) {
+            // innermost axis: box each element as a micropython object
+            list->items[i] = ndarray_get_item(self, 
array); + } else { + list->items[i] = ndarray_recursive_list(self, array, dim-1); + } + array += stride; + } + return MP_OBJ_FROM_PTR(list); +} + +mp_obj_t ndarray_tolist(mp_obj_t self_in) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + uint8_t *array = (uint8_t *)self->array; + return ndarray_recursive_list(self, array, self->ndim); +} + +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_tolist_obj, ndarray_tolist); +#endif + +// Binary operations +ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t obj, uint8_t other_type) { + // creates an ndarray from a micropython int or float + // if the input is an ndarray, it is returned + // if other_type is 0, return the smallest type that can accommodate the object + ndarray_obj_t *ndarray; + + if(mp_obj_is_int(obj)) { + int32_t ivalue = mp_obj_get_int(obj); + if((ivalue < -32767) || (ivalue > 32767)) { + // the integer value clearly does not fit the ulab integer types, so move on to float + ndarray = ndarray_new_linear_array(1, NDARRAY_FLOAT); + mp_float_t *array = (mp_float_t *)ndarray->array; + array[0] = (mp_float_t)ivalue; + } else { + uint8_t dtype; + if(ivalue < 0) { + if(ivalue > -128) { + dtype = NDARRAY_INT8; + } else { + dtype = NDARRAY_INT16; + } + } else { // ivalue >= 0 + if((other_type == NDARRAY_INT8) || (other_type == NDARRAY_INT16)) { + if(ivalue < 128) { + dtype = NDARRAY_INT8; + } else { + dtype = NDARRAY_INT16; + } + } else { // other_type = 0 is also included here + if(ivalue < 256) { + dtype = NDARRAY_UINT8; + } else { + dtype = NDARRAY_UINT16; + } + } + } + ndarray = ndarray_new_linear_array(1, dtype); + ndarray_set_value(dtype, ndarray->array, 0, obj); + } + } else if(mp_obj_is_float(obj)) { + ndarray = ndarray_new_linear_array(1, NDARRAY_FLOAT); + mp_float_t *array = (mp_float_t *)ndarray->array; + array[0] = mp_obj_get_float(obj); + } else if(mp_obj_is_type(obj, &ulab_ndarray_type)){ + return obj; + } + #if ULAB_SUPPORTS_COMPLEX + else if(mp_obj_is_type(obj, &mp_type_complex)) { + ndarray = 
ndarray_new_linear_array(1, NDARRAY_COMPLEX); + mp_float_t *array = (mp_float_t *)ndarray->array; + mp_obj_get_complex(obj, &array[0], &array[1]); + } + #endif + else { + // assume that the input is an iterable (raises an exception, if it is not the case) + ndarray = ndarray_from_iterable(obj, NDARRAY_FLOAT); + } + return ndarray; +} + +#if NDARRAY_HAS_BINARY_OPS || NDARRAY_HAS_INPLACE_OPS +mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) { + // TODO: implement in-place operators + // if the ndarray stands on the right hand side of the expression, simply swap the operands + ndarray_obj_t *lhs, *rhs; + mp_binary_op_t op = _op; + if((op == MP_BINARY_OP_REVERSE_ADD) || (op == MP_BINARY_OP_REVERSE_MULTIPLY) || + (op == MP_BINARY_OP_REVERSE_POWER) || (op == MP_BINARY_OP_REVERSE_SUBTRACT) || + (op == MP_BINARY_OP_REVERSE_TRUE_DIVIDE)) { + lhs = ndarray_from_mp_obj(robj, 0); + rhs = ndarray_from_mp_obj(lobj, lhs->dtype); + } else { + lhs = ndarray_from_mp_obj(lobj, 0); + rhs = ndarray_from_mp_obj(robj, lhs->dtype); + } + if(op == MP_BINARY_OP_REVERSE_ADD) { + op = MP_BINARY_OP_ADD; + } else if(op == MP_BINARY_OP_REVERSE_MULTIPLY) { + op = MP_BINARY_OP_MULTIPLY; + } else if(op == MP_BINARY_OP_REVERSE_POWER) { + op = MP_BINARY_OP_POWER; + } else if(op == MP_BINARY_OP_REVERSE_SUBTRACT) { + op = MP_BINARY_OP_SUBTRACT; + } else if(op == MP_BINARY_OP_REVERSE_TRUE_DIVIDE) { + op = MP_BINARY_OP_TRUE_DIVIDE; + } + + uint8_t ndim = 0; + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS); + int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS); + uint8_t broadcastable; + if((op == MP_BINARY_OP_INPLACE_ADD) || (op == MP_BINARY_OP_INPLACE_MULTIPLY) || (op == MP_BINARY_OP_INPLACE_POWER) || + (op == MP_BINARY_OP_INPLACE_SUBTRACT) || (op == MP_BINARY_OP_INPLACE_TRUE_DIVIDE)) { + broadcastable = ndarray_can_broadcast_inplace(lhs, rhs, rstrides); + } else { + broadcastable = ndarray_can_broadcast(lhs, rhs, &ndim, shape, 
lstrides, rstrides); + } + if(!broadcastable) { + mp_raise_ValueError(translate("operands could not be broadcast together")); + m_del(size_t, shape, ULAB_MAX_DIMS); + m_del(int32_t, lstrides, ULAB_MAX_DIMS); + m_del(int32_t, rstrides, ULAB_MAX_DIMS); + } + // the empty arrays have to be treated separately + uint8_t dtype = NDARRAY_INT16; + ndarray_obj_t *nd; + if((lhs->ndim == 0) || (rhs->ndim == 0)) { + switch(op) { + case MP_BINARY_OP_INPLACE_ADD: + case MP_BINARY_OP_INPLACE_MULTIPLY: + case MP_BINARY_OP_INPLACE_SUBTRACT: + case MP_BINARY_OP_ADD: + case MP_BINARY_OP_MULTIPLY: + case MP_BINARY_OP_SUBTRACT: + // here we don't have to list those cases that result in an int16, + // because dtype is initialised with that NDARRAY_INT16 + if(lhs->dtype == rhs->dtype) { + dtype = rhs->dtype; + } else if((lhs->dtype == NDARRAY_FLOAT) || (rhs->dtype == NDARRAY_FLOAT)) { + dtype = NDARRAY_FLOAT; + } else if(((lhs->dtype == NDARRAY_UINT8) && (rhs->dtype == NDARRAY_UINT16)) || + ((lhs->dtype == NDARRAY_INT8) && (rhs->dtype == NDARRAY_UINT16)) || + ((rhs->dtype == NDARRAY_UINT8) && (lhs->dtype == NDARRAY_UINT16)) || + ((rhs->dtype == NDARRAY_INT8) && (lhs->dtype == NDARRAY_UINT16))) { + dtype = NDARRAY_UINT16; + } + return MP_OBJ_FROM_PTR(ndarray_new_linear_array(0, dtype)); + break; + + case MP_BINARY_OP_INPLACE_POWER: + case MP_BINARY_OP_INPLACE_TRUE_DIVIDE: + case MP_BINARY_OP_POWER: + case MP_BINARY_OP_TRUE_DIVIDE: + return MP_OBJ_FROM_PTR(ndarray_new_linear_array(0, NDARRAY_FLOAT)); + break; + + case MP_BINARY_OP_LESS: + case MP_BINARY_OP_LESS_EQUAL: + case MP_BINARY_OP_MORE: + case MP_BINARY_OP_MORE_EQUAL: + case MP_BINARY_OP_EQUAL: + case MP_BINARY_OP_NOT_EQUAL: + nd = ndarray_new_linear_array(0, NDARRAY_UINT8); + nd->boolean = 1; + return MP_OBJ_FROM_PTR(nd); + + default: + return mp_const_none; + break; + } + } + + switch(op) { + // first the in-place operators + #if NDARRAY_HAS_INPLACE_ADD + case MP_BINARY_OP_INPLACE_ADD: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); 
+ return ndarray_inplace_ams(lhs, rhs, rstrides, op); + break; + #endif + #if NDARRAY_HAS_INPLACE_MULTIPLY + case MP_BINARY_OP_INPLACE_MULTIPLY: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_inplace_ams(lhs, rhs, rstrides, op); + break; + #endif + #if NDARRAY_HAS_INPLACE_POWER + case MP_BINARY_OP_INPLACE_POWER: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_inplace_power(lhs, rhs, rstrides); + break; + #endif + #if NDARRAY_HAS_INPLACE_SUBTRACT + case MP_BINARY_OP_INPLACE_SUBTRACT: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_inplace_ams(lhs, rhs, rstrides, op); + break; + #endif + #if NDARRAY_HAS_INPLACE_TRUE_DIVIDE + case MP_BINARY_OP_INPLACE_TRUE_DIVIDE: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_inplace_divide(lhs, rhs, rstrides); + break; + #endif + // end if in-place operators + + #if NDARRAY_HAS_BINARY_OP_LESS + case MP_BINARY_OP_LESS: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + // here we simply swap the operands + return ndarray_binary_more(rhs, lhs, ndim, shape, rstrides, lstrides, MP_BINARY_OP_MORE); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_LESS_EQUAL + case MP_BINARY_OP_LESS_EQUAL: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + // here we simply swap the operands + return ndarray_binary_more(rhs, lhs, ndim, shape, rstrides, lstrides, MP_BINARY_OP_MORE_EQUAL); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_EQUAL + case MP_BINARY_OP_EQUAL: + return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_EQUAL); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_NOT_EQUAL + case MP_BINARY_OP_NOT_EQUAL: + return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_NOT_EQUAL); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_ADD + case MP_BINARY_OP_ADD: + return ndarray_binary_add(lhs, rhs, ndim, shape, lstrides, rstrides); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_MULTIPLY + case MP_BINARY_OP_MULTIPLY: + return ndarray_binary_multiply(lhs, 
rhs, ndim, shape, lstrides, rstrides); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_MORE + case MP_BINARY_OP_MORE: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_binary_more(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_MORE); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_MORE_EQUAL + case MP_BINARY_OP_MORE_EQUAL: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_binary_more(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_MORE_EQUAL); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_SUBTRACT + case MP_BINARY_OP_SUBTRACT: + return ndarray_binary_subtract(lhs, rhs, ndim, shape, lstrides, rstrides); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_TRUE_DIVIDE + case MP_BINARY_OP_TRUE_DIVIDE: + return ndarray_binary_true_divide(lhs, rhs, ndim, shape, lstrides, rstrides); + break; + #endif + #if NDARRAY_HAS_BINARY_OP_POWER + case MP_BINARY_OP_POWER: + COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype); + return ndarray_binary_power(lhs, rhs, ndim, shape, lstrides, rstrides); + break; + #endif + default: + return MP_OBJ_NULL; // op not supported + break; + } + return MP_OBJ_NULL; +} +#endif /* NDARRAY_HAS_BINARY_OPS || NDARRAY_HAS_INPLACE_OPS */ + +#if NDARRAY_HAS_UNARY_OPS +mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + ndarray_obj_t *ndarray = NULL; + + switch (op) { + #if NDARRAY_HAS_UNARY_OP_ABS + case MP_UNARY_OP_ABS: + #if ULAB_SUPPORTS_COMPLEX + if(self->dtype == NDARRAY_COMPLEX) { + int32_t *strides = strides_from_shape(self->shape, NDARRAY_FLOAT); + ndarray_obj_t *target = ndarray_new_ndarray(self->ndim, self->shape, strides, NDARRAY_FLOAT); + ndarray = carray_abs(self, target); + } else { + #endif + ndarray = ndarray_copy_view(self); + // if Boolean, NDARRAY_UINT8, or NDARRAY_UINT16, there is nothing to do + if(self->dtype == NDARRAY_INT8) { + int8_t *array = (int8_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) { + if(*array < 0) *array = 
-(*array); + } + } else if(self->dtype == NDARRAY_INT16) { + int16_t *array = (int16_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) { + if(*array < 0) *array = -(*array); + } + } else { + mp_float_t *array = (mp_float_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) { + if(*array < 0) *array = -(*array); + } + } + #if ULAB_SUPPORTS_COMPLEX + } + #endif + return MP_OBJ_FROM_PTR(ndarray); + break; + #endif + #if NDARRAY_HAS_UNARY_OP_INVERT + case MP_UNARY_OP_INVERT: + #if ULAB_SUPPORTS_COMPLEX + if(self->dtype == NDARRAY_FLOAT || self->dtype == NDARRAY_COMPLEX) { + #else + if(self->dtype == NDARRAY_FLOAT) { + #endif + mp_raise_ValueError(translate("operation is not supported for given type")); + } + // we can invert the content byte by byte, no need to distinguish between different dtypes + ndarray = ndarray_copy_view(self); // from this point, this is a dense copy + uint8_t *array = (uint8_t *)ndarray->array; + if(ndarray->boolean) { + for(size_t i=0; i < ndarray->len; i++, array++) *array = *array ^ 0x01; + } else { + uint8_t itemsize = ulab_binary_get_size(self->dtype); + for(size_t i=0; i < ndarray->len*itemsize; i++, array++) *array ^= 0xFF; + } + return MP_OBJ_FROM_PTR(ndarray); + break; + #endif + #if NDARRAY_HAS_UNARY_OP_LEN + case MP_UNARY_OP_LEN: + return mp_obj_new_int(self->shape[ULAB_MAX_DIMS - self->ndim]); + break; + #endif + #if NDARRAY_HAS_UNARY_OP_NEGATIVE + case MP_UNARY_OP_NEGATIVE: + ndarray = ndarray_copy_view(self); // from this point, this is a dense copy + if(self->dtype == NDARRAY_UINT8) { + uint8_t *array = (uint8_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) *array = -(*array); + } else if(self->dtype == NDARRAY_INT8) { + int8_t *array = (int8_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) *array = -(*array); + } else if(self->dtype == NDARRAY_UINT16) { + uint16_t *array = (uint16_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) *array = 
-(*array); + } else if(self->dtype == NDARRAY_INT16) { + int16_t *array = (int16_t *)ndarray->array; + for(size_t i=0; i < self->len; i++, array++) *array = -(*array); + } else { + mp_float_t *array = (mp_float_t *)ndarray->array; + size_t len = self->len; + #if ULAB_SUPPORTS_COMPLEX + if(self->dtype == NDARRAY_COMPLEX) { + len *= 2; + } + #endif + for(size_t i=0; i < len; i++, array++) *array = -(*array); + } + return MP_OBJ_FROM_PTR(ndarray); + break; + #endif + #if NDARRAY_HAS_UNARY_OP_POSITIVE + case MP_UNARY_OP_POSITIVE: + return MP_OBJ_FROM_PTR(ndarray_copy_view(self)); + #endif + + default: + return MP_OBJ_NULL; // operator not supported + break; + } +} +#endif /* NDARRAY_HAS_UNARY_OPS */ + +#if NDARRAY_HAS_TRANSPOSE +mp_obj_t ndarray_transpose(mp_obj_t self_in) { + #if ULAB_MAX_DIMS == 1 + return self_in; + #endif + // TODO: check, what happens to the offset here, if we have a view + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + if(self->ndim == 1) { + return self_in; + } + size_t *shape = m_new(size_t, self->ndim); + int32_t *strides = m_new(int32_t, self->ndim); + for(uint8_t i=0; i < self->ndim; i++) { + shape[ULAB_MAX_DIMS - 1 - i] = self->shape[ULAB_MAX_DIMS - self->ndim + i]; + strides[ULAB_MAX_DIMS - 1 - i] = self->strides[ULAB_MAX_DIMS - self->ndim + i]; + } + // TODO: I am not sure ndarray_new_view is OK here... + // should be deep copy... 
+ ndarray_obj_t *ndarray = ndarray_new_view(self, self->ndim, shape, strides, 0); + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_transpose_obj, ndarray_transpose); +#endif /* NDARRAY_HAS_TRANSPOSE */ + +#if ULAB_MAX_DIMS > 1 +#if NDARRAY_HAS_RESHAPE +mp_obj_t ndarray_reshape_core(mp_obj_t oin, mp_obj_t _shape, bool inplace) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(oin); + if(!mp_obj_is_type(_shape, &mp_type_tuple)) { + mp_raise_TypeError(translate("shape must be a tuple")); + } + + mp_obj_tuple_t *shape = MP_OBJ_TO_PTR(_shape); + if(shape->len > ULAB_MAX_DIMS) { + mp_raise_ValueError(translate("maximum number of dimensions is 4")); + } + size_t *new_shape = m_new(size_t, ULAB_MAX_DIMS); + memset(new_shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + size_t new_length = 1; + for(uint8_t i=0; i < shape->len; i++) { + new_shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape->items[shape->len - i - 1]); + new_length *= new_shape[ULAB_MAX_DIMS - i - 1]; + } + if(source->len != new_length) { + mp_raise_ValueError(translate("input and output shapes are not compatible")); + } + ndarray_obj_t *ndarray; + if(ndarray_is_dense(source)) { + int32_t *new_strides = strides_from_shape(new_shape, source->dtype); + if(inplace) { + for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) { + source->shape[i] = new_shape[i]; + source->strides[i] = new_strides[i]; + } + return MP_OBJ_FROM_PTR(oin); + } else { + ndarray = ndarray_new_view(source, shape->len, new_shape, new_strides, 0); + } + } else { + if(inplace) { + mp_raise_ValueError(translate("cannot assign new shape")); + } + ndarray = ndarray_new_ndarray_from_tuple(shape, source->dtype); + ndarray_copy_array(source, ndarray, 0); + } + return MP_OBJ_FROM_PTR(ndarray); +} + +mp_obj_t ndarray_reshape(mp_obj_t oin, mp_obj_t _shape) { + return ndarray_reshape_core(oin, _shape, 0); +} + +MP_DEFINE_CONST_FUN_OBJ_2(ndarray_reshape_obj, ndarray_reshape); +#endif /* NDARRAY_HAS_RESHAPE */ +#endif /* ULAB_MAX_DIMS > 1 */ + +#if 
ULAB_NUMPY_HAS_NDINFO +mp_obj_t ndarray_info(mp_obj_t obj_in) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(obj_in); + if(!mp_obj_is_type(ndarray, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("function is defined for ndarrays only")); + } + mp_printf(MP_PYTHON_PRINTER, "class: ndarray\n"); + mp_printf(MP_PYTHON_PRINTER, "shape: ("); + if(ndarray->ndim == 1) { + mp_printf(MP_PYTHON_PRINTER, "%d,", ndarray->shape[ULAB_MAX_DIMS-1]); + } else { + for(uint8_t i=0; i < ndarray->ndim-1; i++) mp_printf(MP_PYTHON_PRINTER, "%d, ", ndarray->shape[i]); + mp_printf(MP_PYTHON_PRINTER, "%d", ndarray->shape[ULAB_MAX_DIMS-1]); + } + mp_printf(MP_PYTHON_PRINTER, ")\n"); + mp_printf(MP_PYTHON_PRINTER, "strides: ("); + if(ndarray->ndim == 1) { + mp_printf(MP_PYTHON_PRINTER, "%d,", ndarray->strides[ULAB_MAX_DIMS-1]); + } else { + for(uint8_t i=0; i < ndarray->ndim-1; i++) mp_printf(MP_PYTHON_PRINTER, "%d, ", ndarray->strides[i]); + mp_printf(MP_PYTHON_PRINTER, "%d", ndarray->strides[ULAB_MAX_DIMS-1]); + } + mp_printf(MP_PYTHON_PRINTER, ")\n"); + mp_printf(MP_PYTHON_PRINTER, "itemsize: %d\n", ndarray->itemsize); + mp_printf(MP_PYTHON_PRINTER, "data pointer: 0x%p\n", ndarray->array); + mp_printf(MP_PYTHON_PRINTER, "type: "); + if(ndarray->boolean) { + mp_printf(MP_PYTHON_PRINTER, "bool\n"); + } else if(ndarray->dtype == NDARRAY_UINT8) { + mp_printf(MP_PYTHON_PRINTER, "uint8\n"); + } else if(ndarray->dtype == NDARRAY_INT8) { + mp_printf(MP_PYTHON_PRINTER, "int8\n"); + } else if(ndarray->dtype == NDARRAY_UINT16) { + mp_printf(MP_PYTHON_PRINTER, "uint16\n"); + } else if(ndarray->dtype == NDARRAY_INT16) { + mp_printf(MP_PYTHON_PRINTER, "int16\n"); + } else if(ndarray->dtype == NDARRAY_FLOAT) { + mp_printf(MP_PYTHON_PRINTER, "float\n"); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_info_obj, ndarray_info); +#endif + +// (the get_buffer protocol returns 0 for success, 1 for failure) +mp_int_t ndarray_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t 
flags) { + ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in); + if(!ndarray_is_dense(self)) { + return 1; + } + bufinfo->len = self->itemsize * self->len; + bufinfo->buf = self->array; + bufinfo->typecode = self->dtype; + return 0; +} diff --git a/circuitpython/extmod/ulab/code/ndarray.h b/circuitpython/extmod/ulab/code/ndarray.h new file mode 100644 index 0000000..4478f94 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray.h @@ -0,0 +1,749 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Jeff Epler for Adafruit Industries +*/ + +#ifndef _NDARRAY_ +#define _NDARRAY_ + +#include "py/objarray.h" +#include "py/binary.h" +#include "py/objstr.h" +#include "py/objlist.h" + +#include "ulab.h" + +#ifndef MP_PI +#define MP_PI MICROPY_FLOAT_CONST(3.14159265358979323846) +#endif +#ifndef MP_E +#define MP_E MICROPY_FLOAT_CONST(2.71828182845904523536) +#endif + +#if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT +#define FLOAT_TYPECODE 'f' +#elif MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_DOUBLE +#define FLOAT_TYPECODE 'd' +#endif + +// this typedef is lifted from objfloat.c, because mp_obj_float_t is not exposed +typedef struct _mp_obj_float_t { + mp_obj_base_t base; + mp_float_t value; +} mp_obj_float_t; + +#if defined(MICROPY_VERSION_MAJOR) && MICROPY_VERSION_MAJOR == 1 && MICROPY_VERSION_MINOR == 11 +typedef struct _mp_obj_slice_t { + mp_obj_base_t base; + mp_obj_t start; + mp_obj_t stop; + mp_obj_t step; +} mp_obj_slice_t; +#define MP_ERROR_TEXT(x) x +#endif + +#if !defined(MP_TYPE_FLAG_EXTENDED) +#define MP_TYPE_CALL call +#define mp_type_get_call_slot(t) t->call +#define MP_TYPE_FLAG_EXTENDED (0) +#define MP_TYPE_EXTENDED_FIELDS(...) 
__VA_ARGS__
+#endif
+
+#if !CIRCUITPY
+#define translate(x) MP_ERROR_TEXT(x)
+#define ndarray_set_value(a, b, c, d) mp_binary_set_val_array(a, b, c, d)
+#else
+// CircuitPython provides its own implementation (defined in ndarray.c)
+void ndarray_set_value(char , void *, size_t , mp_obj_t );
+#endif
+
+void ndarray_set_complex_value(void *, size_t , mp_obj_t );
+
+#define NDARRAY_NUMERIC 0
+#define NDARRAY_BOOLEAN 1
+
+#define NDARRAY_NDARRAY_TYPE 1
+#define NDARRAY_ITERABLE_TYPE 2
+
+extern const mp_obj_type_t ulab_ndarray_type;
+
+// dtype codes; the values double as array.array typecodes
+enum NDARRAY_TYPE {
+    NDARRAY_BOOL = '?', // this must never be assigned to the dtype!
+    NDARRAY_UINT8 = 'B',
+    NDARRAY_INT8 = 'b',
+    NDARRAY_UINT16 = 'H',
+    NDARRAY_INT16 = 'h',
+    #if ULAB_SUPPORTS_COMPLEX
+    NDARRAY_COMPLEX = 'c',
+    #endif
+    NDARRAY_FLOAT = FLOAT_TYPECODE, // 'f' or 'd', depending on MICROPY_FLOAT_IMPL
+};
+
+// the core n-dimensional array object
+typedef struct _ndarray_obj_t {
+    mp_obj_base_t base;
+    uint8_t dtype;      // one of enum NDARRAY_TYPE
+    uint8_t itemsize;   // bytes per element
+    uint8_t boolean;    // NDARRAY_BOOLEAN if this is a Boolean array stored as uint8
+    uint8_t ndim;       // number of used dimensions (<= ULAB_MAX_DIMS)
+    size_t len;         // total number of elements
+    size_t shape[ULAB_MAX_DIMS];    // right-aligned: shape[ULAB_MAX_DIMS-ndim..]
+    int32_t strides[ULAB_MAX_DIMS]; // byte strides, right-aligned like shape
+    void *array;        // pointer to the first element (may point into origin for views)
+    void *origin;       // owner of the underlying buffer
+} ndarray_obj_t;
+
+#if ULAB_HAS_DTYPE_OBJECT
+extern const mp_obj_type_t ulab_dtype_type;
+
+// lightweight wrapper exposing a dtype code as a python object
+typedef struct _dtype_obj_t {
+    mp_obj_base_t base;
+    uint8_t dtype;
+} dtype_obj_t;
+
+void ndarray_dtype_print(const mp_print_t *, mp_obj_t , mp_print_kind_t );
+
+mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *, size_t , size_t , const mp_obj_t *);
+#endif /* ULAB_HAS_DTYPE_OBJECT */
+
+extern const mp_obj_type_t ndarray_flatiter_type;
+
+mp_obj_t ndarray_new_ndarray_iterator(mp_obj_t , mp_obj_iter_buf_t *);
+
+mp_obj_t ndarray_get_item(ndarray_obj_t *, void *);
+mp_float_t ndarray_get_float_value(void *, uint8_t );
+mp_float_t ndarray_get_float_index(void *, uint8_t , size_t );
+bool ndarray_object_is_array_like(mp_obj_t );
+void fill_array_iterable(mp_float_t *, mp_obj_t );
+size_t *ndarray_shape_vector(size_t , size_t , size_t , size_t );
+
+void ndarray_print(const mp_print_t *, mp_obj_t , mp_print_kind_t );
+
+#if ULAB_HAS_PRINTOPTIONS
+mp_obj_t ndarray_set_printoptions(size_t , const mp_obj_t 
*, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(ndarray_set_printoptions_obj); + +mp_obj_t ndarray_get_printoptions(void); +MP_DECLARE_CONST_FUN_OBJ_0(ndarray_get_printoptions_obj); +#endif + +void ndarray_assign_elements(ndarray_obj_t *, mp_obj_t , uint8_t , size_t *); +size_t *ndarray_contract_shape(ndarray_obj_t *, uint8_t ); +int32_t *ndarray_contract_strides(ndarray_obj_t *, uint8_t ); + +ndarray_obj_t *ndarray_new_dense_ndarray(uint8_t , size_t *, uint8_t ); +ndarray_obj_t *ndarray_new_ndarray_from_tuple(mp_obj_tuple_t *, uint8_t ); +ndarray_obj_t *ndarray_new_ndarray(uint8_t , size_t *, int32_t *, uint8_t ); +ndarray_obj_t *ndarray_new_linear_array(size_t , uint8_t ); +ndarray_obj_t *ndarray_new_view(ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t ); +bool ndarray_is_dense(ndarray_obj_t *); +ndarray_obj_t *ndarray_copy_view(ndarray_obj_t *); +ndarray_obj_t *ndarray_copy_view_convert_type(ndarray_obj_t *, uint8_t ); +void ndarray_copy_array(ndarray_obj_t *, ndarray_obj_t *, uint8_t ); + +MP_DECLARE_CONST_FUN_OBJ_KW(ndarray_array_constructor_obj); +mp_obj_t ndarray_make_new(const mp_obj_type_t *, size_t , size_t , const mp_obj_t *); +mp_obj_t ndarray_subscr(mp_obj_t , mp_obj_t , mp_obj_t ); +mp_obj_t ndarray_getiter(mp_obj_t , mp_obj_iter_buf_t *); +bool ndarray_can_broadcast(ndarray_obj_t *, ndarray_obj_t *, uint8_t *, size_t *, int32_t *, int32_t *); +bool ndarray_can_broadcast_inplace(ndarray_obj_t *, ndarray_obj_t *, int32_t *); +mp_obj_t ndarray_binary_op(mp_binary_op_t , mp_obj_t , mp_obj_t ); +mp_obj_t ndarray_unary_op(mp_unary_op_t , mp_obj_t ); + +size_t *ndarray_new_coords(uint8_t ); +void ndarray_rewind_array(uint8_t , uint8_t *, size_t *, int32_t *, size_t *); + +// various ndarray methods +#if NDARRAY_HAS_BYTESWAP +mp_obj_t ndarray_byteswap(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(ndarray_byteswap_obj); +#endif + +#if NDARRAY_HAS_COPY +mp_obj_t ndarray_copy(mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_1(ndarray_copy_obj); 
+#endif + +#if NDARRAY_HAS_FLATTEN +mp_obj_t ndarray_flatten(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(ndarray_flatten_obj); +#endif + +mp_obj_t ndarray_dtype(mp_obj_t ); +mp_obj_t ndarray_itemsize(mp_obj_t ); +mp_obj_t ndarray_size(mp_obj_t ); +mp_obj_t ndarray_shape(mp_obj_t ); +mp_obj_t ndarray_strides(mp_obj_t ); + +#if NDARRAY_HAS_RESHAPE +mp_obj_t ndarray_reshape_core(mp_obj_t , mp_obj_t , bool ); +mp_obj_t ndarray_reshape(mp_obj_t , mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_2(ndarray_reshape_obj); +#endif + +#if NDARRAY_HAS_TOBYTES +mp_obj_t ndarray_tobytes(mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_1(ndarray_tobytes_obj); +#endif + +#if NDARRAY_HAS_TOBYTES +mp_obj_t ndarray_tolist(mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_1(ndarray_tolist_obj); +#endif + +#if NDARRAY_HAS_TRANSPOSE +mp_obj_t ndarray_transpose(mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_1(ndarray_transpose_obj); +#endif + +#if ULAB_NUMPY_HAS_NDINFO +mp_obj_t ndarray_info(mp_obj_t ); +MP_DECLARE_CONST_FUN_OBJ_1(ndarray_info_obj); +#endif + +mp_int_t ndarray_get_buffer(mp_obj_t , mp_buffer_info_t *, mp_uint_t ); +//void ndarray_attributes(mp_obj_t , qstr , mp_obj_t *); + +ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t , uint8_t ); + + +#define BOOLEAN_ASSIGNMENT_LOOP(type_left, type_right, ndarray, lstrides, iarray, istride, varray, vstride)\ + type_left *array = (type_left *)(ndarray)->array;\ + for(size_t i=0; i < (ndarray)->len; i++) {\ + if(*(iarray)) {\ + *array = (type_left)(*((type_right *)(varray)));\ + (varray) += (vstride);\ + }\ + array += (lstrides);\ + (iarray) += (istride);\ + } while(0) + +#if ULAB_HAS_FUNCTION_ITERATOR +#define BINARY_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + type_out *array = (type_out *)(results)->array;\ + size_t *lcoords = ndarray_new_coords((results)->ndim);\ + size_t *rcoords = ndarray_new_coords((results)->ndim);\ + for(size_t i=0; i < (results)->len/(results)->shape[ULAB_MAX_DIMS -1]; i++) {\ + 
size_t l = 0;\ + do {\ + *array++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + ndarray_rewind_array((results)->ndim, (larray), (results)->shape, (lstrides), lcoords);\ + ndarray_rewind_array((results)->ndim, (rarray), (results)->shape, (rstrides), rcoords);\ + } while(0) + +#define INPLACE_LOOP(results, type_left, type_right, larray, rarray, rstrides, OPERATOR)\ + size_t *lcoords = ndarray_new_coords((results)->ndim);\ + size_t *rcoords = ndarray_new_coords((results)->ndim);\ + for(size_t i=0; i < (results)->len/(results)->shape[ULAB_MAX_DIMS -1]; i++) {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + ndarray_rewind_array((results)->ndim, (larray), (results)->shape, (results)->strides, lcoords);\ + ndarray_rewind_array((results)->ndim, (rarray), (results)->shape, (rstrides), rcoords);\ + } while(0) + +#define EQUALITY_LOOP(results, array, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t *lcoords = ndarray_new_coords((results)->ndim);\ + size_t *rcoords = ndarray_new_coords((results)->ndim);\ + for(size_t i=0; i < (results)->len/(results)->shape[ULAB_MAX_DIMS -1]; i++) {\ + size_t l = 0;\ + do {\ + *(array)++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 
1 : 0;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + ndarray_rewind_array((results)->ndim, (larray), (results)->shape, (lstrides), lcoords);\ + ndarray_rewind_array((results)->ndim, (rarray), (results)->shape, (rstrides), rcoords);\ + } while(0) + +#define POWER_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides)\ + type_out *array = (type_out *)(results)->array;\ + size_t *lcoords = ndarray_new_coords((results)->ndim);\ + size_t *rcoords = ndarray_new_coords((results)->ndim);\ + for(size_t i=0; i < (results)->len/(results)->shape[ULAB_MAX_DIMS -1]; i++) {\ + size_t l = 0;\ + do {\ + *array++ = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + ndarray_rewind_array((results)->ndim, (larray), (results)->shape, (lstrides), lcoords);\ + ndarray_rewind_array((results)->ndim, (rarray), (results)->shape, (rstrides), rcoords);\ + } while(0) + +#else + +#if ULAB_MAX_DIMS == 1 +#define BINARY_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + type_out *array = (type_out *)results->array;\ + size_t l = 0;\ + do {\ + *array++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + +#define INPLACE_LOOP(results, type_left, type_right, larray, rarray, rstrides, OPERATOR)\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + +#define EQUALITY_LOOP(results, 
array, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t l = 0;\ + do {\ + *(array)++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 1 : 0;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + +#define POWER_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides)\ + type_out *array = (type_out *)results->array;\ + size_t l = 0;\ + do {\ + *array++ = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + +#endif /* ULAB_MAX_DIMS == 1 */ + +#if ULAB_MAX_DIMS == 2 +#define BINARY_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + type_out *array = (type_out *)(results)->array;\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + +#define INPLACE_LOOP(results, type_left, type_right, larray, rarray, rstrides, OPERATOR)\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * 
(results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + +#define EQUALITY_LOOP(results, array, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 1 : 0;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + +#define POWER_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides)\ + type_out *array = (type_out *)(results)->array;\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + +#endif /* ULAB_MAX_DIMS == 2 */ + +#if ULAB_MAX_DIMS == 3 +#define BINARY_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + type_out *array = (type_out *)results->array;\ + size_t j = 0;\ + 
do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + +#define INPLACE_LOOP(results, type_left, type_right, larray, rarray, rstrides, OPERATOR)\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + +#define 
EQUALITY_LOOP(results, array, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 1 : 0;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + +#define POWER_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides)\ + type_out *array = (type_out *)results->array;\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= 
(rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + +#endif /* ULAB_MAX_DIMS == 3 */ + +#if ULAB_MAX_DIMS == 4 +#define BINARY_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + type_out *array = (type_out *)results->array;\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ + +#define INPLACE_LOOP(results, type_left, type_right, larray, rarray, rstrides, OPERATOR)\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) OPERATOR *((type_right *)(rarray));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += 
(rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ + +#define EQUALITY_LOOP(results, array, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 
1 : 0;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ + +#define POWER_LOOP(results, type_out, type_left, type_right, larray, lstrides, rarray, rstrides)\ + type_out *array = (type_out *)results->array;\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *array++ = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * 
(results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ + +#endif /* ULAB_MAX_DIMS == 4 */ +#endif /* ULAB_HAS_FUNCTION_ITERATOR */ + + +#if ULAB_MAX_DIMS == 1 +#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\ + type_left *larray = (type_left *)(results)->array;\ + size_t l = 0;\ + do {\ + *larray = (type_left)(*((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + +#endif /* ULAB_MAX_DIMS == 1 */ + +#if ULAB_MAX_DIMS == 2 +#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\ + type_left *larray = (type_left *)(results)->array;\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *larray = (type_left)(*((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + +#endif /* ULAB_MAX_DIMS == 2 */ + +#if ULAB_MAX_DIMS == 3 +#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\ + type_left 
*larray = (type_left *)(results)->array;\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *larray = (type_left)(*((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + +#endif /* ULAB_MAX_DIMS == 3 */ + +#if ULAB_MAX_DIMS == 4 +#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\ + type_left *larray = (type_left *)(results)->array;\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *larray = (type_left)(*((type_right *)(rarray)));\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (rarray) += 
(rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ + +#endif /* ULAB_MAX_DIMS == 4 */ + +#endif diff --git a/circuitpython/extmod/ulab/code/ndarray_operators.c b/circuitpython/extmod/ulab/code/ndarray_operators.c new file mode 100644 index 0000000..de1042c --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray_operators.c @@ -0,0 +1,839 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + + +#include <math.h> + +#include "py/runtime.h" +#include "py/objtuple.h" +#include "ndarray.h" +#include "ndarray_operators.h" +#include "ulab.h" +#include "ulab_tools.h" +#include "numpy/carray/carray.h" + +/* + This file contains the actual implementations of the various + ndarray operators. 
+ + These are the upcasting rules of the binary operators + + - if complex is supported, and if one of the operarands is a complex, the result is always complex + - if both operarands are real one of them is a float, then the result is also a float + - operation on identical types preserves type + + uint8 + int8 => int16 + uint8 + int16 => int16 + uint8 + uint16 => uint16 + int8 + int16 => int16 + int8 + uint16 => uint16 + uint16 + int16 => float +*/ + +#if NDARRAY_HAS_BINARY_OP_EQUAL | NDARRAY_HAS_BINARY_OP_NOT_EQUAL +mp_obj_t ndarray_binary_equality(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) { + + #if ULAB_SUPPORTS_COMPLEX + if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) { + return carray_binary_equal_not_equal(lhs, rhs, ndim, shape, lstrides, rstrides, op); + } + #endif + + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8); + results->boolean = 1; + uint8_t *array = (uint8_t *)results->array; + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + #if NDARRAY_HAS_BINARY_OP_EQUAL + if(op == MP_BINARY_OP_EQUAL) { + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ==); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_INT8) { + 
EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, ==); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, ==); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, ==); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, ==); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, ==); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } + } + #endif /* NDARRAY_HAS_BINARY_OP_EQUAL */ + + #if NDARRAY_HAS_BINARY_OP_NOT_EQUAL + if(op == MP_BINARY_OP_NOT_EQUAL) { + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_INT8) { + 
EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, !=); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, !=); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, !=); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, !=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, !=); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } 
else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, !=); + } else { + return ndarray_binary_op(op, rhs, lhs); + } + } + } + #endif /* NDARRAY_HAS_BINARY_OP_NOT_EQUAL */ + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_EQUAL | NDARRAY_HAS_BINARY_OP_NOT_EQUAL */ + +#if NDARRAY_HAS_BINARY_OP_ADD +mp_obj_t ndarray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + + #if ULAB_SUPPORTS_COMPLEX + if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) { + return carray_binary_add(lhs, rhs, ndim, shape, lstrides, rstrides); + } + #endif + + ndarray_obj_t *results = NULL; + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, +); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == 
NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8); + BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, +); + } else { + return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, +); + } else { + return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, +); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, 
rstrides, +); + } else { + return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, +); + } else { + return ndarray_binary_op(MP_BINARY_OP_ADD, rhs, lhs); + } + } + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_ADD */ + +#if NDARRAY_HAS_BINARY_OP_MULTIPLY +mp_obj_t ndarray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + + #if ULAB_SUPPORTS_COMPLEX + if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) { + return carray_binary_multiply(lhs, rhs, ndim, shape, lstrides, rstrides); + } + #endif + + ndarray_obj_t *results = NULL; + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, 
lstrides, rarray, rstrides, *); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8); + BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, *); + } else { + return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, *); + } else { + return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, *); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, 
shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, *); + } else { + return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, *); + } else { + return ndarray_binary_op(MP_BINARY_OP_MULTIPLY, rhs, lhs); + } + } + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_MULTIPLY */ + +#if NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_MORE_EQUAL | NDARRAY_HAS_BINARY_OP_LESS | NDARRAY_HAS_BINARY_OP_LESS_EQUAL +mp_obj_t ndarray_binary_more(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) { + + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8); + results->boolean = 1; + uint8_t *array = (uint8_t *)results->array; + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + #if NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_LESS + if(op == MP_BINARY_OP_MORE) { + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, >); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == 
NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, int8_t, uint8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, >); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, uint16_t, int8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, int16_t, uint8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, int16_t, int8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int16_t, uint16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, 
uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, >); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, >); + } + } + } + #endif /* NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_LESS*/ + #if NDARRAY_HAS_BINARY_OP_MORE_EQUAL | NDARRAY_HAS_BINARY_OP_LESS_EQUAL + if(op == MP_BINARY_OP_MORE_EQUAL) { + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, uint8_t, int8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint8_t, int16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, >=); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, int8_t, uint8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, int8_t, int8_t, larray, lstrides, rarray, rstrides, >=); + } else 
if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int8_t, uint16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int8_t, int16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, >=); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, uint16_t, int8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, uint16_t, int16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >=); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, int16_t, uint8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, int16_t, int8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, int16_t, uint16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, int16_t, int16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, >=); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_UINT8) { + EQUALITY_LOOP(results, array, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, 
>=); + } else if(rhs->dtype == NDARRAY_INT8) { + EQUALITY_LOOP(results, array, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_UINT16) { + EQUALITY_LOOP(results, array, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_INT16) { + EQUALITY_LOOP(results, array, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, >=); + } else if(rhs->dtype == NDARRAY_FLOAT) { + EQUALITY_LOOP(results, array, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, >=); + } + } + } + #endif /* NDARRAY_HAS_BINARY_OP_MORE_EQUAL | NDARRAY_HAS_BINARY_OP_LESS_EQUAL */ + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_MORE | NDARRAY_HAS_BINARY_OP_MORE_EQUAL | NDARRAY_HAS_BINARY_OP_LESS | NDARRAY_HAS_BINARY_OP_LESS_EQUAL */ + +#if NDARRAY_HAS_BINARY_OP_SUBTRACT +mp_obj_t ndarray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + + #if ULAB_SUPPORTS_COMPLEX + if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) { + return carray_binary_subtract(lhs, rhs, ndim, shape, lstrides, rstrides); + } + #endif + + ndarray_obj_t *results = NULL; + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8); + BINARY_LOOP(results, uint8_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == 
NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, -); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT8); + BINARY_LOOP(results, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, -); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, uint16_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT16); + BINARY_LOOP(results, 
uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, -); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_INT16); + BINARY_LOOP(results, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, -); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_UINT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT8) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, -); + } else 
if(rhs->dtype == NDARRAY_UINT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_INT16) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, -); + } else if(rhs->dtype == NDARRAY_FLOAT) { + results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, -); + } + } + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_SUBTRACT */ + +#if NDARRAY_HAS_BINARY_OP_TRUE_DIVIDE +mp_obj_t ndarray_binary_true_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + + #if ULAB_SUPPORTS_COMPLEX + if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) { + return carray_binary_divide(lhs, rhs, ndim, shape, lstrides, rstrides); + } + #endif + + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + #if NDARRAY_BINARY_USES_FUN_POINTER + mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype); + mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype); + + uint8_t *array = (uint8_t *)results->array; + void (*set_result)(void *, mp_float_t ) = ndarray_set_float_function(NDARRAY_FLOAT); + + // Note that lvalue and rvalue are local variables in the macro itself + FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, lvalue/rvalue); + + #else + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + BINARY_LOOP(results, mp_float_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT8) { + 
BINARY_LOOP(results, mp_float_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_UINT16) { + BINARY_LOOP(results, mp_float_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT16) { + BINARY_LOOP(results, mp_float_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_FLOAT) { + BINARY_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, /); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_UINT8) { + BINARY_LOOP(results, mp_float_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT8) { + BINARY_LOOP(results, mp_float_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_UINT16) { + BINARY_LOOP(results, mp_float_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT16) { + BINARY_LOOP(results, mp_float_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_FLOAT) { + BINARY_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, /); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT8) { + BINARY_LOOP(results, mp_float_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT8) { + BINARY_LOOP(results, mp_float_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_UINT16) { + BINARY_LOOP(results, mp_float_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT16) { + BINARY_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_FLOAT) { + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, /); + } + } else if(lhs->dtype == NDARRAY_INT16) { + 
if(rhs->dtype == NDARRAY_UINT8) { + BINARY_LOOP(results, mp_float_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT8) { + BINARY_LOOP(results, mp_float_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_UINT16) { + BINARY_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT16) { + BINARY_LOOP(results, mp_float_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_FLOAT) { + BINARY_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, /); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_UINT8) { + BINARY_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT8) { + BINARY_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_UINT16) { + BINARY_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_INT16) { + BINARY_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, /); + } else if(rhs->dtype == NDARRAY_FLOAT) { + BINARY_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, /); + } + } + #endif /* NDARRAY_BINARY_USES_FUN_POINTER */ + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_TRUE_DIVIDE */ + +#if NDARRAY_HAS_BINARY_OP_POWER +mp_obj_t ndarray_binary_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) { + + // Note that numpy upcasts the results to int64, if the inputs are of integer type, + // while we always return a float array. 
+ ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + #if NDARRAY_BINARY_USES_FUN_POINTER + mp_float_t (*get_lhs)(void *) = ndarray_get_float_function(lhs->dtype); + mp_float_t (*get_rhs)(void *) = ndarray_get_float_function(rhs->dtype); + + uint8_t *array = (uint8_t *)results->array; + void (*set_result)(void *, mp_float_t ) = ndarray_set_float_function(NDARRAY_FLOAT); + + // Note that lvalue and rvalue are local variables in the macro itself + FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, MICROPY_FLOAT_C_FUN(pow)(lvalue, rvalue)); + + #else + if(lhs->dtype == NDARRAY_UINT8) { + if(rhs->dtype == NDARRAY_UINT8) { + POWER_LOOP(results, mp_float_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + POWER_LOOP(results, mp_float_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + POWER_LOOP(results, mp_float_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT16) { + POWER_LOOP(results, mp_float_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_FLOAT) { + POWER_LOOP(results, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides); + } + } else if(lhs->dtype == NDARRAY_INT8) { + if(rhs->dtype == NDARRAY_UINT8) { + POWER_LOOP(results, mp_float_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + POWER_LOOP(results, mp_float_t, int8_t, int8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + POWER_LOOP(results, mp_float_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT16) { + POWER_LOOP(results, mp_float_t, int8_t, int16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == 
NDARRAY_FLOAT) { + POWER_LOOP(results, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides); + } + } else if(lhs->dtype == NDARRAY_UINT16) { + if(rhs->dtype == NDARRAY_UINT8) { + POWER_LOOP(results, mp_float_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + POWER_LOOP(results, mp_float_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + POWER_LOOP(results, mp_float_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT16) { + POWER_LOOP(results, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_FLOAT) { + POWER_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides); + } + } else if(lhs->dtype == NDARRAY_INT16) { + if(rhs->dtype == NDARRAY_UINT8) { + POWER_LOOP(results, mp_float_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + POWER_LOOP(results, mp_float_t, int16_t, int8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + POWER_LOOP(results, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT16) { + POWER_LOOP(results, mp_float_t, int16_t, int16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_FLOAT) { + POWER_LOOP(results, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides); + } + } else if(lhs->dtype == NDARRAY_FLOAT) { + if(rhs->dtype == NDARRAY_UINT8) { + POWER_LOOP(results, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + POWER_LOOP(results, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + POWER_LOOP(results, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == 
NDARRAY_INT16) { + POWER_LOOP(results, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_FLOAT) { + POWER_LOOP(results, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides); + } + } + #endif /* NDARRAY_BINARY_USES_FUN_POINTER */ + + return MP_OBJ_FROM_PTR(results); +} +#endif /* NDARRAY_HAS_BINARY_OP_POWER */ + +#if NDARRAY_HAS_INPLACE_ADD || NDARRAY_HAS_INPLACE_MULTIPLY || NDARRAY_HAS_INPLACE_SUBTRACT +mp_obj_t ndarray_inplace_ams(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides, uint8_t optype) { + + if((lhs->dtype != NDARRAY_FLOAT) && (rhs->dtype == NDARRAY_FLOAT)) { + mp_raise_TypeError(translate("cannot cast output with casting rule")); + } + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + #if NDARRAY_HAS_INPLACE_ADD + if(optype == MP_BINARY_OP_INPLACE_ADD) { + UNWRAP_INPLACE_OPERATOR(lhs, larray, rarray, rstrides, +=); + } + #endif + #if NDARRAY_HAS_INPLACE_ADD + if(optype == MP_BINARY_OP_INPLACE_MULTIPLY) { + UNWRAP_INPLACE_OPERATOR(lhs, larray, rarray, rstrides, *=); + } + #endif + #if NDARRAY_HAS_INPLACE_SUBTRACT + if(optype == MP_BINARY_OP_INPLACE_SUBTRACT) { + UNWRAP_INPLACE_OPERATOR(lhs, larray, rarray, rstrides, -=); + } + #endif + + return MP_OBJ_FROM_PTR(lhs); +} +#endif /* NDARRAY_HAS_INPLACE_ADD || NDARRAY_HAS_INPLACE_MULTIPLY || NDARRAY_HAS_INPLACE_SUBTRACT */ + +#if NDARRAY_HAS_INPLACE_TRUE_DIVIDE +mp_obj_t ndarray_inplace_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides) { + + if((lhs->dtype != NDARRAY_FLOAT)) { + mp_raise_TypeError(translate("results cannot be cast to specified type")); + } + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + if(rhs->dtype == NDARRAY_UINT8) { + INPLACE_LOOP(lhs, mp_float_t, uint8_t, larray, rarray, rstrides, /=); + } else if(rhs->dtype == NDARRAY_INT8) { + INPLACE_LOOP(lhs, mp_float_t, int8_t, larray, rarray, rstrides, /=); + } else 
if(rhs->dtype == NDARRAY_UINT16) { + INPLACE_LOOP(lhs, mp_float_t, uint16_t, larray, rarray, rstrides, /=); + } else if(rhs->dtype == NDARRAY_INT16) { + INPLACE_LOOP(lhs, mp_float_t, int16_t, larray, rarray, rstrides, /=); + } else if(lhs->dtype == NDARRAY_FLOAT) { + INPLACE_LOOP(lhs, mp_float_t, mp_float_t, larray, rarray, rstrides, /=); + } + return MP_OBJ_FROM_PTR(lhs); +} +#endif /* NDARRAY_HAS_INPLACE_DIVIDE */ + +#if NDARRAY_HAS_INPLACE_POWER +mp_obj_t ndarray_inplace_power(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32_t *rstrides) { + + if((lhs->dtype != NDARRAY_FLOAT)) { + mp_raise_TypeError(translate("results cannot be cast to specified type")); + } + uint8_t *larray = (uint8_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + if(rhs->dtype == NDARRAY_UINT8) { + INPLACE_POWER(lhs, mp_float_t, uint8_t, larray, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT8) { + INPLACE_POWER(lhs, mp_float_t, int8_t, larray, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_UINT16) { + INPLACE_POWER(lhs, mp_float_t, uint16_t, larray, rarray, rstrides); + } else if(rhs->dtype == NDARRAY_INT16) { + INPLACE_POWER(lhs, mp_float_t, int16_t, larray, rarray, rstrides); + } else if(lhs->dtype == NDARRAY_FLOAT) { + INPLACE_POWER(lhs, mp_float_t, mp_float_t, larray, rarray, rstrides); + } + return MP_OBJ_FROM_PTR(lhs); +} +#endif /* NDARRAY_HAS_INPLACE_POWER */ diff --git a/circuitpython/extmod/ulab/code/ndarray_operators.h b/circuitpython/extmod/ulab/code/ndarray_operators.h new file mode 100644 index 0000000..7849e03 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray_operators.h @@ -0,0 +1,277 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#include "ndarray.h" + +mp_obj_t ndarray_binary_equality(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *, mp_binary_op_t ); +mp_obj_t 
ndarray_binary_add(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *); +mp_obj_t ndarray_binary_multiply(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *); +mp_obj_t ndarray_binary_more(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *, mp_binary_op_t ); +mp_obj_t ndarray_binary_power(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *); +mp_obj_t ndarray_binary_subtract(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *); +mp_obj_t ndarray_binary_true_divide(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *); + +mp_obj_t ndarray_inplace_ams(ndarray_obj_t *, ndarray_obj_t *, int32_t *, uint8_t ); +mp_obj_t ndarray_inplace_power(ndarray_obj_t *, ndarray_obj_t *, int32_t *); +mp_obj_t ndarray_inplace_divide(ndarray_obj_t *, ndarray_obj_t *, int32_t *); + +#define UNWRAP_INPLACE_OPERATOR(lhs, larray, rarray, rstrides, OPERATOR)\ +({\ + if((lhs)->dtype == NDARRAY_UINT8) {\ + if((rhs)->dtype == NDARRAY_UINT8) {\ + INPLACE_LOOP((lhs), uint8_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT8) {\ + INPLACE_LOOP((lhs), uint8_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_UINT16) {\ + INPLACE_LOOP((lhs), uint8_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else {\ + INPLACE_LOOP((lhs), uint8_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\ + }\ + } else if(lhs->dtype == NDARRAY_INT8) {\ + if(rhs->dtype == NDARRAY_UINT8) {\ + INPLACE_LOOP((lhs), int8_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT8) {\ + INPLACE_LOOP((lhs), int8_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_UINT16) {\ + INPLACE_LOOP((lhs), int8_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else {\ + INPLACE_LOOP((lhs), int8_t, int16_t, (larray), (rarray), 
(rstrides), OPERATOR);\ + }\ + } else if(lhs->dtype == NDARRAY_UINT16) {\ + if(rhs->dtype == NDARRAY_UINT8) {\ + INPLACE_LOOP((lhs), uint16_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT8) {\ + INPLACE_LOOP((lhs), uint16_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_UINT16) {\ + INPLACE_LOOP((lhs), uint16_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else {\ + INPLACE_LOOP((lhs), uint16_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\ + }\ + } else if(lhs->dtype == NDARRAY_INT16) {\ + if(rhs->dtype == NDARRAY_UINT8) {\ + INPLACE_LOOP((lhs), int16_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT8) {\ + INPLACE_LOOP((lhs), int16_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_UINT16) {\ + INPLACE_LOOP((lhs), int16_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else {\ + INPLACE_LOOP((lhs), int16_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\ + }\ + } else if(lhs->dtype == NDARRAY_FLOAT) {\ + if(rhs->dtype == NDARRAY_UINT8) {\ + INPLACE_LOOP((lhs), mp_float_t, uint8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT8) {\ + INPLACE_LOOP((lhs), mp_float_t, int8_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_UINT16) {\ + INPLACE_LOOP((lhs), mp_float_t, uint16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else if(rhs->dtype == NDARRAY_INT16) {\ + INPLACE_LOOP((lhs), mp_float_t, int16_t, (larray), (rarray), (rstrides), OPERATOR);\ + } else {\ + INPLACE_LOOP((lhs), mp_float_t, mp_float_t, (larray), (rarray), (rstrides), OPERATOR);\ + }\ + }\ +}) + +#if ULAB_MAX_DIMS == 1 +#define INPLACE_POWER(results, type_left, type_right, larray, rarray, rstrides)\ +({ size_t l = 0;\ + do {\ + *((type_left *)(larray)) = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ 
+ (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ +}) + +#define FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, OPERATION)\ +({ size_t l = 0;\ + do {\ + mp_float_t lvalue = (get_lhs)((larray));\ + mp_float_t rvalue = (get_rhs)((rarray));\ + (set_result)((array), OPERATION);\ + (array) += (results)->itemsize;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ +}) +#endif /* ULAB_MAX_DIMS == 1 */ + +#if ULAB_MAX_DIMS == 2 +#define INPLACE_POWER(results, type_left, type_right, larray, rarray, rstrides)\ +({ size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ +}) + +#define FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, OPERATION)\ +({ size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + mp_float_t lvalue = (get_lhs)((larray));\ + mp_float_t rvalue = (get_rhs)((rarray));\ + (set_result)((array), OPERATION);\ + (array) += (results)->itemsize;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += 
(lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ +}) +#endif /* ULAB_MAX_DIMS == 2 */ + +#if ULAB_MAX_DIMS == 3 +#define INPLACE_POWER(results, type_left, type_right, larray, rarray, rstrides)\ +({ size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ +}) + + +#define FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, OPERATION)\ +({ size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + mp_float_t lvalue = (get_lhs)((larray));\ + mp_float_t rvalue = (get_rhs)((rarray));\ + (set_result)((array), OPERATION);\ + (array) += (results)->itemsize;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += 
(lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ +}) +#endif /* ULAB_MAX_DIMS == 3 */ + +#if ULAB_MAX_DIMS == 4 +#define INPLACE_POWER(results, type_left, type_right, larray, rarray, rstrides)\ +({ size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_left *)(larray)) = MICROPY_FLOAT_C_FUN(pow)(*((type_left *)(larray)), *((type_right *)(rarray)));\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS 
- 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ +}) + +#define FUNC_POINTER_LOOP(results, array, get_lhs, get_rhs, larray, lstrides, rarray, rstrides, OPERATION)\ +({ size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + mp_float_t lvalue = (get_lhs)((larray));\ + mp_float_t rvalue = (get_rhs)((rarray));\ + (set_result)((array), OPERATION);\ + (array) += (results)->itemsize;\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (results)->strides[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (larray) += (results)->strides[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\ +}) +#endif /* ULAB_MAX_DIMS == 4 */ diff --git a/circuitpython/extmod/ulab/code/ndarray_properties.c b/circuitpython/extmod/ulab/code/ndarray_properties.c new file mode 100644 index 0000000..5464b31 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray_properties.c @@ -0,0 +1,123 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT 
License (MIT) + * + * Copyright (c) 2021 Zoltán Vörös + * +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" + +#include "ulab.h" +#include "ndarray.h" +#include "numpy/ndarray/ndarray_iter.h" +#if ULAB_SUPPORTS_COMPLEX +#include "numpy/carray/carray.h" +#endif + +#ifndef CIRCUITPY + +// a somewhat hackish implementation of property getters/setters; +// this functions is hooked into the attr member of ndarray + +STATIC void call_local_method(mp_obj_t obj, qstr attr, mp_obj_t *dest) { + const mp_obj_type_t *type = mp_obj_get_type(obj); + while (type->locals_dict != NULL) { + assert(type->locals_dict->base.type == &mp_type_dict); // MicroPython restriction, for now + mp_map_t *locals_map = &type->locals_dict->map; + mp_map_elem_t *elem = mp_map_lookup(locals_map, MP_OBJ_NEW_QSTR(attr), MP_MAP_LOOKUP); + if (elem != NULL) { + mp_convert_member_lookup(obj, type, elem->value, dest); + break; + } + if (type->parent == NULL) { + break; + } + type = type->parent; + } +} + + +void ndarray_properties_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) { + if (dest[0] == MP_OBJ_NULL) { + switch(attr) { + #if NDARRAY_HAS_DTYPE + case MP_QSTR_dtype: + dest[0] = ndarray_dtype(self_in); + break; + #endif + #if NDARRAY_HAS_FLATITER + case MP_QSTR_flat: + dest[0] = ndarray_flatiter_make_new(self_in); + break; + #endif + #if NDARRAY_HAS_ITEMSIZE + case MP_QSTR_itemsize: + dest[0] = ndarray_itemsize(self_in); + break; + #endif + #if NDARRAY_HAS_SHAPE + case MP_QSTR_shape: + dest[0] = ndarray_shape(self_in); + break; + #endif + #if NDARRAY_HAS_SIZE + case MP_QSTR_size: + dest[0] = ndarray_size(self_in); + break; + #endif + #if NDARRAY_HAS_STRIDES + case MP_QSTR_strides: + dest[0] = ndarray_strides(self_in); + break; + #endif + #if NDARRAY_HAS_TRANSPOSE + case MP_QSTR_T: + dest[0] = ndarray_transpose(self_in); + break; + #endif + #if ULAB_SUPPORTS_COMPLEX + #if ULAB_NUMPY_HAS_IMAG + case MP_QSTR_imag: + 
dest[0] = carray_imag(self_in); + break; + #endif + #if ULAB_NUMPY_HAS_IMAG + case MP_QSTR_real: + dest[0] = carray_real(self_in); + break; + #endif + #endif /* ULAB_SUPPORTS_COMPLEX */ + default: + call_local_method(self_in, attr, dest); + break; + } + } else { + if(dest[1]) { + switch(attr) { + #if ULAB_MAX_DIMS > 1 + #if NDARRAY_HAS_RESHAPE + case MP_QSTR_shape: + ndarray_reshape_core(self_in, dest[1], 1); + break; + #endif + #endif + default: + return; + break; + } + dest[0] = MP_OBJ_NULL; + } + } +} + +#endif /* CIRCUITPY */
\ No newline at end of file diff --git a/circuitpython/extmod/ulab/code/ndarray_properties.h b/circuitpython/extmod/ulab/code/ndarray_properties.h new file mode 100644 index 0000000..28da7c0 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ndarray_properties.h @@ -0,0 +1,104 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020-2021 Zoltán Vörös +*/ + +#ifndef _NDARRAY_PROPERTIES_ +#define _NDARRAY_PROPERTIES_ + +#include "py/runtime.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objarray.h" + +#include "ulab.h" +#include "ndarray.h" +#include "numpy/ndarray/ndarray_iter.h" + +#if CIRCUITPY +typedef struct _mp_obj_property_t { + mp_obj_base_t base; + mp_obj_t proxy[3]; // getter, setter, deleter +} mp_obj_property_t; + +#if NDARRAY_HAS_DTYPE +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_get_dtype_obj, ndarray_dtype); +STATIC const mp_obj_property_t ndarray_dtype_obj = { + .base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_get_dtype_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_DTYPE */ + +#if NDARRAY_HAS_FLATITER +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_flatiter_make_new_obj, ndarray_flatiter_make_new); +STATIC const mp_obj_property_t ndarray_flat_obj = { + .base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_flatiter_make_new_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_FLATITER */ + +#if NDARRAY_HAS_ITEMSIZE +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_get_itemsize_obj, ndarray_itemsize); +STATIC const mp_obj_property_t ndarray_itemsize_obj = { + .base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_get_itemsize_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_ITEMSIZE */ + +#if NDARRAY_HAS_SHAPE +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_get_shape_obj, ndarray_shape); +STATIC const mp_obj_property_t ndarray_shape_obj = { + 
.base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_get_shape_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_SHAPE */ + +#if NDARRAY_HAS_SIZE +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_get_size_obj, ndarray_size); +STATIC const mp_obj_property_t ndarray_size_obj = { + .base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_get_size_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_SIZE */ + +#if NDARRAY_HAS_STRIDES +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_get_strides_obj, ndarray_strides); +STATIC const mp_obj_property_t ndarray_strides_obj = { + .base.type = &mp_type_property, + .proxy = {(mp_obj_t)&ndarray_get_strides_obj, + mp_const_none, + mp_const_none }, +}; +#endif /* NDARRAY_HAS_STRIDES */ + +#else + +void ndarray_properties_attr(mp_obj_t , qstr , mp_obj_t *); + +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_dtype_obj, ndarray_dtype); +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_flatiter_make_new_obj, ndarray_flatiter_make_new); +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_itemsize_obj, ndarray_itemsize); +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_shape_obj, ndarray_shape); +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_size_obj, ndarray_size); +MP_DEFINE_CONST_FUN_OBJ_1(ndarray_strides_obj, ndarray_strides); + +#endif /* CIRCUITPY */ + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/approx.c b/circuitpython/extmod/ulab/code/numpy/approx.c new file mode 100644 index 0000000..85cdbf7 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/approx.c @@ -0,0 +1,227 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös + * 2020 Diego Elio Pettenò + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "carray/carray_tools.h" +#include "approx.h" + +//| """Numerical 
approximation methods""" +//| + +const mp_obj_float_t approx_trapz_dx = {{&mp_type_float}, MICROPY_FLOAT_CONST(1.0)}; + +#if ULAB_NUMPY_HAS_INTERP +//| def interp( +//| x: ulab.numpy.ndarray, +//| xp: ulab.numpy.ndarray, +//| fp: ulab.numpy.ndarray, +//| *, +//| left: Optional[_float] = None, +//| right: Optional[_float] = None +//| ) -> ulab.numpy.ndarray: +//| """ +//| :param ulab.numpy.ndarray x: The x-coordinates at which to evaluate the interpolated values. +//| :param ulab.numpy.ndarray xp: The x-coordinates of the data points, must be increasing +//| :param ulab.numpy.ndarray fp: The y-coordinates of the data points, same length as xp +//| :param left: Value to return for ``x < xp[0]``, default is ``fp[0]``. +//| :param right: Value to return for ``x > xp[-1]``, default is ``fp[-1]``. +//| +//| Returns the one-dimensional piecewise linear interpolant to a function with given discrete data points (xp, fp), evaluated at x.""" +//| ... +//| + +STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_left, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + { MP_QSTR_right, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + }; + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + ndarray_obj_t *x = ndarray_from_mp_obj(args[0].u_obj, 0); + ndarray_obj_t *xp = ndarray_from_mp_obj(args[1].u_obj, 0); // xp must hold an increasing sequence of independent values + ndarray_obj_t *fp = ndarray_from_mp_obj(args[2].u_obj, 0); + COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype) + COMPLEX_DTYPE_NOT_IMPLEMENTED(xp->dtype) + 
COMPLEX_DTYPE_NOT_IMPLEMENTED(fp->dtype) + if((xp->ndim != 1) || (fp->ndim != 1) || (xp->len < 2) || (fp->len < 2) || (xp->len != fp->len)) { + mp_raise_ValueError(translate("interp is defined for 1D iterables of equal length")); + } + + ndarray_obj_t *y = ndarray_new_linear_array(x->len, NDARRAY_FLOAT); + mp_float_t left_value, right_value; + uint8_t *xparray = (uint8_t *)xp->array; + + mp_float_t xp_left = ndarray_get_float_value(xparray, xp->dtype); + xparray += (xp->len-1) * xp->strides[ULAB_MAX_DIMS - 1]; + mp_float_t xp_right = ndarray_get_float_value(xparray, xp->dtype); + + uint8_t *fparray = (uint8_t *)fp->array; + + if(args[3].u_obj == mp_const_none) { + left_value = ndarray_get_float_value(fparray, fp->dtype); + } else { + left_value = mp_obj_get_float(args[3].u_obj); + } + if(args[4].u_obj == mp_const_none) { + fparray += (fp->len-1) * fp->strides[ULAB_MAX_DIMS - 1]; + right_value = ndarray_get_float_value(fparray, fp->dtype); + } else { + right_value = mp_obj_get_float(args[4].u_obj); + } + + xparray = xp->array; + fparray = fp->array; + + uint8_t *xarray = (uint8_t *)x->array; + mp_float_t *yarray = (mp_float_t *)y->array; + uint8_t *temp; + + for(size_t i=0; i < x->len; i++, yarray++) { + mp_float_t x_value = ndarray_get_float_value(xarray, x->dtype); + xarray += x->strides[ULAB_MAX_DIMS - 1]; + if(x_value < xp_left) { + *yarray = left_value; + } else if(x_value > xp_right) { + *yarray = right_value; + } else { // do the binary search here + mp_float_t xp_left_, xp_right_; + mp_float_t fp_left, fp_right; + size_t left_index = 0, right_index = xp->len - 1, middle_index; + while(right_index - left_index > 1) { + middle_index = left_index + (right_index - left_index) / 2; + temp = xparray + middle_index * xp->strides[ULAB_MAX_DIMS - 1]; + mp_float_t xp_middle = ndarray_get_float_value(temp, xp->dtype); + if(x_value <= xp_middle) { + right_index = middle_index; + } else { + left_index = middle_index; + } + } + temp = xparray + left_index * 
xp->strides[ULAB_MAX_DIMS - 1]; + xp_left_ = ndarray_get_float_value(temp, xp->dtype); + + temp = xparray + right_index * xp->strides[ULAB_MAX_DIMS - 1]; + xp_right_ = ndarray_get_float_value(temp, xp->dtype); + + temp = fparray + left_index * fp->strides[ULAB_MAX_DIMS - 1]; + fp_left = ndarray_get_float_value(temp, fp->dtype); + + temp = fparray + right_index * fp->strides[ULAB_MAX_DIMS - 1]; + fp_right = ndarray_get_float_value(temp, fp->dtype); + + *yarray = fp_left + (x_value - xp_left_) * (fp_right - fp_left) / (xp_right_ - xp_left_); + } + } + return MP_OBJ_FROM_PTR(y); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(approx_interp_obj, 2, approx_interp); +#endif + +#if ULAB_NUMPY_HAS_TRAPZ +//| def trapz(y: ulab.numpy.ndarray, x: Optional[ulab.numpy.ndarray] = None, dx: _float = 1.0) -> _float: +//| """ +//| :param 1D ulab.numpy.ndarray y: the values of the dependent variable +//| :param 1D ulab.numpy.ndarray x: optional, the coordinates of the independent variable. Defaults to uniformly spaced values. +//| :param float dx: the spacing between sample points, if x=None +//| +//| Returns the integral of y(x) using the trapezoidal rule. +//| """ +//| ... 
+//| + +STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_x, MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_dx, MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&approx_trapz_dx)} }, + }; + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + ndarray_obj_t *y = ndarray_from_mp_obj(args[0].u_obj, 0); + COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype) + ndarray_obj_t *x; + mp_float_t mean = MICROPY_FLOAT_CONST(0.0); + if(y->len < 2) { + return mp_obj_new_float(mean); + } + if((y->ndim != 1)) { + mp_raise_ValueError(translate("trapz is defined for 1D iterables")); + } + + mp_float_t (*funcy)(void *) = ndarray_get_float_function(y->dtype); + uint8_t *yarray = (uint8_t *)y->array; + + size_t count = 1; + mp_float_t y1, y2, m; + + if(args[1].u_obj != mp_const_none) { + x = ndarray_from_mp_obj(args[1].u_obj, 0); // x must hold an increasing sequence of independent values + COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype) + if((x->ndim != 1) || (y->len != x->len)) { + mp_raise_ValueError(translate("trapz is defined for 1D arrays of equal length")); + } + + mp_float_t (*funcx)(void *) = ndarray_get_float_function(x->dtype); + uint8_t *xarray = (uint8_t *)x->array; + mp_float_t x1, x2; + + y1 = funcy(yarray); + yarray += y->strides[ULAB_MAX_DIMS - 1]; + x1 = funcx(xarray); + xarray += x->strides[ULAB_MAX_DIMS - 1]; + + for(size_t i=1; i < y->len; i++) { + y2 = funcy(yarray); + yarray += y->strides[ULAB_MAX_DIMS - 1]; + x2 = funcx(xarray); + xarray += x->strides[ULAB_MAX_DIMS - 1]; + mp_float_t value = (x2 - x1) * (y2 + y1); + m = mean + (value - mean) / (mp_float_t)count; + mean = m; + x1 = x2; + y1 = y2; + count++; + } + } else { + mp_float_t dx = mp_obj_get_float(args[2].u_obj); + y1 = funcy(yarray); + yarray += 
y->strides[ULAB_MAX_DIMS - 1]; + + for(size_t i=1; i < y->len; i++) { + y2 = ndarray_get_float_index(y->array, y->dtype, i); + mp_float_t value = (y2 + y1); + m = mean + (value - mean) / (mp_float_t)count; + mean = m; + y1 = y2; + count++; + } + mean *= dx; + } + return mp_obj_new_float(MICROPY_FLOAT_CONST(0.5)*mean*(y->len-1)); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(approx_trapz_obj, 1, approx_trapz); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/approx.h b/circuitpython/extmod/ulab/code/numpy/approx.h new file mode 100644 index 0000000..487a98b --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/approx.h @@ -0,0 +1,29 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#ifndef _APPROX_ +#define _APPROX_ + +#include "../ulab.h" +#include "../ndarray.h" + +#define APPROX_EPS MICROPY_FLOAT_CONST(1.0e-4) +#define APPROX_NONZDELTA MICROPY_FLOAT_CONST(0.05) +#define APPROX_ZDELTA MICROPY_FLOAT_CONST(0.00025) +#define APPROX_ALPHA MICROPY_FLOAT_CONST(1.0) +#define APPROX_BETA MICROPY_FLOAT_CONST(2.0) +#define APPROX_GAMMA MICROPY_FLOAT_CONST(0.5) +#define APPROX_DELTA MICROPY_FLOAT_CONST(0.5) + +MP_DECLARE_CONST_FUN_OBJ_KW(approx_interp_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(approx_trapz_obj); + +#endif /* _APPROX_ */ diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray.c b/circuitpython/extmod/ulab/code/numpy/carray/carray.c new file mode 100644 index 0000000..a5f8a2b --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/carray/carray.c @@ -0,0 +1,826 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2021-2022 Zoltán Vörös +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/objint.h" +#include "py/runtime.h" +#include "py/builtin.h" +#include 
"py/misc.h" + +#include "../../ulab.h" +#include "../../ndarray.h" +#include "../../ulab_tools.h" +#include "carray.h" + +#if ULAB_SUPPORTS_COMPLEX + +//| import ulab.numpy + +//| def real(val): +//| """ +//| Return the real part of the complex argument, which can be +//| either an ndarray, or a scalar.""" +//| ... +//| + +mp_obj_t carray_real(mp_obj_t _source) { + if(mp_obj_is_type(_source, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(_source); + if(source->dtype != NDARRAY_COMPLEX) { + ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype); + ndarray_copy_array(source, target, 0); + return MP_OBJ_FROM_PTR(target); + } else { // the input is most definitely a complex array + ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); + ndarray_copy_array(source, target, 0); + return MP_OBJ_FROM_PTR(target); + } + } else { + mp_raise_NotImplementedError(translate("function is implemented for ndarrays only")); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_1(carray_real_obj, carray_real); + +//| def imag(val): +//| """ +//| Return the imaginary part of the complex argument, which can be +//| either an ndarray, or a scalar.""" +//| ... 
+//| + +mp_obj_t carray_imag(mp_obj_t _source) { + if(mp_obj_is_type(_source, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(_source); + if(source->dtype != NDARRAY_COMPLEX) { // if not complex, then the imaginary part is zero + ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype); + return MP_OBJ_FROM_PTR(target); + } else { // the input is most definitely a complex array + ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); + ndarray_copy_array(source, target, source->itemsize / 2); + return MP_OBJ_FROM_PTR(target); + } + } else { + mp_raise_NotImplementedError(translate("function is implemented for ndarrays only")); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_1(carray_imag_obj, carray_imag); + +#if ULAB_NUMPY_HAS_CONJUGATE + +//| def conjugate(val): +//| """ +//| Return the conjugate of the complex argument, which can be +//| either an ndarray, or a scalar.""" +//| ... +//| +mp_obj_t carray_conjugate(mp_obj_t _source) { + if(mp_obj_is_type(_source, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(_source); + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype); + ndarray_copy_array(source, ndarray, 0); + if(source->dtype == NDARRAY_COMPLEX) { + mp_float_t *array = (mp_float_t *)ndarray->array; + array++; + for(size_t i = 0; i < ndarray->len; i++) { + *array *= MICROPY_FLOAT_CONST(-1.0); + array += 2; + } + } + return MP_OBJ_FROM_PTR(ndarray); + } else { + if(mp_obj_is_type(_source, &mp_type_complex)) { + mp_float_t real, imag; + mp_obj_get_complex(_source, &real, &imag); + imag = imag * MICROPY_FLOAT_CONST(-1.0); + return mp_obj_new_complex(real, imag); + } else if(mp_obj_is_int(_source) || mp_obj_is_float(_source)) { + return _source; + } else { + mp_raise_TypeError(translate("input must be an ndarray, or a scalar")); + } + } + // this should never happen + return mp_const_none; +} + 
+MP_DEFINE_CONST_FUN_OBJ_1(carray_conjugate_obj, carray_conjugate); +#endif + +#if ULAB_NUMPY_HAS_SORT_COMPLEX +//| def sort_complex(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| .. param: a +//| a one-dimensional ndarray +//| +//| Sort a complex array using the real part first, then the imaginary part. +//| Always returns a sorted complex array, even if the input was real.""" +//| ... +//| + +static void carray_sort_complex_(mp_float_t *array, size_t len) { + // array is assumed to be a floating vector containing the real and imaginary parts + // of a complex array at alternating positions as + // array[0] = real[0] + // array[1] = imag[0] + // array[2] = real[1] + // array[3] = imag[1] + + mp_float_t real, imag; + size_t c, q = len, p, r = len >> 1; + for (;;) { + if (r > 0) { + r--; + real = array[2 * r]; + imag = array[2 * r + 1]; + } else { + q--; + if(q == 0) { + break; + } + real = array[2 * q]; + imag = array[2 * q + 1]; + array[2 * q] = array[0]; + array[2 * q + 1] = array[1]; + } + p = r; + c = r + r + 1; + while (c < q) { + if(c + 1 < q) { + if((array[2 * (c+1)] > array[2 * c]) || + ((array[2 * (c+1)] == array[2 * c]) && (array[2 * (c+1) + 1] > array[2 * c + 1]))) { + c++; + } + } + if((array[2 * c] > real) || + ((array[2 * c] == real) && (array[2 * c + 1] > imag))) { + array[2 * p] = array[2 * c]; // real part + array[2 * p + 1] = array[2 * c + 1]; // imag part + p = c; + c = p + p + 1; + } else { + break; + } + } + array[2 * p] = real; + array[2 * p + 1] = imag; + } +} + +mp_obj_t carray_sort_complex(mp_obj_t _source) { + if(!mp_obj_is_type(_source, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("input must be a 1D ndarray")); + } + ndarray_obj_t *source = MP_OBJ_TO_PTR(_source); + if(source->ndim != 1) { + mp_raise_TypeError(translate("input must be a 1D ndarray")); + } + + ndarray_obj_t *ndarray = ndarray_copy_view_convert_type(source, NDARRAY_COMPLEX); + mp_float_t *array = (mp_float_t *)ndarray->array; + 
carray_sort_complex_(array, ndarray->len); + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_1(carray_sort_complex_obj, carray_sort_complex); +#endif + +//| def abs(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| .. param: a +//| a one-dimensional ndarray +//| +//| Return the absolute value of complex ndarray.""" +//| ... +//| + +mp_obj_t carray_abs(ndarray_obj_t *source, ndarray_obj_t *target) { + // calculates the absolute value of a complex array and returns a dense array + uint8_t *sarray = (uint8_t *)source->array; + mp_float_t *tarray = (mp_float_t *)target->array; + uint8_t itemsize = mp_binary_get_size('@', NDARRAY_FLOAT, NULL); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t rvalue = *(mp_float_t *)sarray; + mp_float_t ivalue = *(mp_float_t *)(sarray + itemsize); + *tarray++ = MICROPY_FLOAT_C_FUN(sqrt)(rvalue * rvalue + ivalue * ivalue); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif + return MP_OBJ_FROM_PTR(target); +} + +static void carray_copy_part(uint8_t *tarray, uint8_t *sarray, size_t *shape, int32_t *strides) { + // copies the real or imaginary part of an array + // 
into the respective part of a dense complex array + uint8_t sz = sizeof(mp_float_t); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(tarray, sarray, sz); + tarray += 2 * sz; + sarray += strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS-1]; + sarray += strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + sarray -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2]; + sarray += strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + sarray -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS-3]; + sarray += strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < shape[ULAB_MAX_DIMS - 4]); + #endif /* ULAB_MAX_DIMS > 3 */ +} + +mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *lhs, ndarray_obj_t *rhs, + uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) { + + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8); + results->boolean = 1; + uint8_t *array = (uint8_t *)results->array; + + if(op == MP_BINARY_OP_NOT_EQUAL) { + memset(array, 1, results->len); + } + + if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) { + mp_float_t *larray = (mp_float_t *)lhs->array; + mp_float_t *rarray = (mp_float_t *)rhs->array; + + ulab_rescale_float_strides(lstrides); + ulab_rescale_float_strides(rstrides); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + if((larray[0] == rarray[0]) && (larray[1] == rarray[1])) { + *array ^= 0x01; + } + array++; + 
larray += lstrides[ULAB_MAX_DIMS - 1]; + rarray += rstrides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < results->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; + larray += lstrides[ULAB_MAX_DIMS - 2]; + rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; + rarray += rstrides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < results->shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2]; + larray += lstrides[ULAB_MAX_DIMS - 3]; + rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2]; + rarray += rstrides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < results->shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3]; + larray += lstrides[ULAB_MAX_DIMS - 4]; + rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3]; + rarray += rstrides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < results->shape[ULAB_MAX_DIMS - 4]); + #endif /* ULAB_MAX_DIMS > 3 */ + } else { // only one of the operands is complex + mp_float_t *larray = (mp_float_t *)lhs->array; + uint8_t *rarray = (uint8_t *)rhs->array; + + // align the complex array to the left + uint8_t rdtype = rhs->dtype; + int32_t *lstrides_ = lstrides; + int32_t *rstrides_ = rstrides; + + if(rhs->dtype == NDARRAY_COMPLEX) { + larray = (mp_float_t *)rhs->array; + rarray = (uint8_t *)lhs->array; + lstrides_ = rstrides; + rstrides_ = lstrides; + rdtype = lhs->dtype; + } + + ulab_rescale_float_strides(lstrides_); + + if(rdtype == NDARRAY_UINT8) { + BINARY_LOOP_COMPLEX_EQUAL(results, array, uint8_t, larray, lstrides_, rarray, rstrides_); + } else if(rdtype == NDARRAY_INT8) { + BINARY_LOOP_COMPLEX_EQUAL(results, array, int8_t, larray, lstrides_, rarray, rstrides_); + } else if(rdtype == NDARRAY_UINT16) { + 
BINARY_LOOP_COMPLEX_EQUAL(results, array, uint16_t, larray, lstrides_, rarray, rstrides_);
        } else if(rdtype == NDARRAY_INT16) {
            BINARY_LOOP_COMPLEX_EQUAL(results, array, int16_t, larray, lstrides_, rarray, rstrides_);
        } else if(rdtype == NDARRAY_FLOAT) {
            BINARY_LOOP_COMPLEX_EQUAL(results, array, mp_float_t, larray, lstrides_, rarray, rstrides_);
        }
    }
    return MP_OBJ_FROM_PTR(results);
}

// Element-wise addition when at least one operand has NDARRAY_COMPLEX dtype.
// The result is always a dense NDARRAY_COMPLEX ndarray of the broadcast shape;
// lstrides/rstrides are the broadcast strides produced by the caller.
mp_obj_t carray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
    uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {

    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
    mp_float_t *resarray = (mp_float_t *)results->array;

    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
        mp_float_t *larray = (mp_float_t *)lhs->array;
        mp_float_t *rarray = (mp_float_t *)rhs->array;

        // strides are byte counts; rescale to mp_float_t units for float pointer arithmetic
        ulab_rescale_float_strides(lstrides);
        ulab_rescale_float_strides(rstrides);

        // dimension loops are unrolled at compile time, up to ULAB_MAX_DIMS deep
        #if ULAB_MAX_DIMS > 3
        size_t i = 0;
        do {
        #endif
            #if ULAB_MAX_DIMS > 2
            size_t j = 0;
            do {
            #endif
                #if ULAB_MAX_DIMS > 1
                size_t k = 0;
                do {
                #endif
                    size_t l = 0;
                    do {
                        // real part
                        *resarray++ = larray[0] + rarray[0];
                        // imaginary part
                        *resarray++ = larray[1] + rarray[1];
                        larray += lstrides[ULAB_MAX_DIMS - 1];
                        rarray += rstrides[ULAB_MAX_DIMS - 1];
                        l++;
                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
                #if ULAB_MAX_DIMS > 1
                    // rewind the innermost axis, step the next one
                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    larray += lstrides[ULAB_MAX_DIMS - 2];
                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    rarray += rstrides[ULAB_MAX_DIMS - 2];
                    k++;
                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
                #endif /* ULAB_MAX_DIMS > 1 */
                #if ULAB_MAX_DIMS > 2
                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                larray += lstrides[ULAB_MAX_DIMS - 3];
                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                rarray += rstrides[ULAB_MAX_DIMS - 3];
                j++;
            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
            #endif /* ULAB_MAX_DIMS > 2 */
        #if ULAB_MAX_DIMS > 3
            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            larray += lstrides[ULAB_MAX_DIMS - 4];
            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            rarray += rstrides[ULAB_MAX_DIMS - 4];
            i++;
        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
        #endif /* ULAB_MAX_DIMS > 3 */
    } else { // only one of the operands is complex
        uint8_t *larray = (uint8_t *)lhs->array;
        uint8_t *rarray = (uint8_t *)rhs->array;

        // align the complex array to the left
        uint8_t rdtype = rhs->dtype;
        int32_t *lstrides_ = lstrides;
        int32_t *rstrides_ = rstrides;

        if(rhs->dtype == NDARRAY_COMPLEX) {
            // swap operands (and their strides), so that the complex one is on the left;
            // addition is commutative, so this is safe
            larray = (uint8_t *)rhs->array;
            rarray = (uint8_t *)lhs->array;
            lstrides_ = rstrides;
            rstrides_ = lstrides;
            rdtype = lhs->dtype;
        }

        // real part: real(complex) OPERATOR real-valued operand, dispatched on dtype
        if(rdtype == NDARRAY_UINT8) {
            BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides_, rarray, rstrides_, +);
        } else if(rdtype == NDARRAY_INT8) {
            BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides_, rarray, rstrides_, +);
        } else if(rdtype == NDARRAY_UINT16) {
            BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides_, rarray, rstrides_, +);
        } else if(rdtype == NDARRAY_INT16) {
            BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides_, rarray, rstrides_, +);
        } else if(rdtype == NDARRAY_FLOAT) {
            BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides_, rarray, rstrides_, +);
        }

        // simply copy the imaginary part
        uint8_t *tarray = (uint8_t *)results->array;
        tarray += sizeof(mp_float_t);

        if(lhs->dtype == NDARRAY_COMPLEX) {
            rarray = (uint8_t *)lhs->array;
            rstrides = lstrides;
        } else {
            rarray = (uint8_t *)rhs->array;
        }
        rarray += sizeof(mp_float_t);
        carray_copy_part(tarray, rarray, results->shape, rstrides);
    }
    return MP_OBJ_FROM_PTR(results);
}

// helper for carray_binary_multiply: one dtype-dispatched pass over the data
// (signature continues on the next span of the dump)
static void carray_binary_multiply_(ndarray_obj_t
*results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
    int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {

    // multiply the float component pointed to by larray with the real-valued
    // operand, dispatching the loop on the real operand's dtype
    if(rdtype == NDARRAY_UINT8) {
        BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, *);
    } else if(rdtype == NDARRAY_INT8) {
        BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, *);
    } else if(rdtype == NDARRAY_UINT16) {
        BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, *);
    } else if(rdtype == NDARRAY_INT16) {
        BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, *);
    } else if(rdtype == NDARRAY_FLOAT) {
        BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, *);
    }
}

// Element-wise multiplication when at least one operand is NDARRAY_COMPLEX.
// Result is a dense NDARRAY_COMPLEX ndarray of the broadcast shape.
mp_obj_t carray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
    uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {

    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
    mp_float_t *resarray = (mp_float_t *)results->array;

    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
        mp_float_t *larray = (mp_float_t *)lhs->array;
        mp_float_t *rarray = (mp_float_t *)rhs->array;

        // strides are byte counts; rescale for mp_float_t pointer arithmetic
        ulab_rescale_float_strides(lstrides);
        ulab_rescale_float_strides(rstrides);

        #if ULAB_MAX_DIMS > 3
        size_t i = 0;
        do {
        #endif
            #if ULAB_MAX_DIMS > 2
            size_t j = 0;
            do {
            #endif
                #if ULAB_MAX_DIMS > 1
                size_t k = 0;
                do {
                #endif
                    size_t l = 0;
                    do {
                        // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
                        // real part
                        *resarray++ = larray[0] * rarray[0] - larray[1] * rarray[1];
                        // imaginary part
                        *resarray++ = larray[0] * rarray[1] + larray[1] * rarray[0];
                        larray += lstrides[ULAB_MAX_DIMS - 1];
                        rarray += rstrides[ULAB_MAX_DIMS - 1];
                        l++;
                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
                #if ULAB_MAX_DIMS > 1
                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    larray += lstrides[ULAB_MAX_DIMS - 2];
                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    rarray += rstrides[ULAB_MAX_DIMS - 2];
                    k++;
                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
                #endif /* ULAB_MAX_DIMS > 1 */
                #if ULAB_MAX_DIMS > 2
                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                larray += lstrides[ULAB_MAX_DIMS - 3];
                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                rarray += rstrides[ULAB_MAX_DIMS - 3];
                j++;
            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
            #endif /* ULAB_MAX_DIMS > 2 */
        #if ULAB_MAX_DIMS > 3
            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            larray += lstrides[ULAB_MAX_DIMS - 4];
            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            rarray += rstrides[ULAB_MAX_DIMS - 4];
            i++;
        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
        #endif /* ULAB_MAX_DIMS > 3 */
    } else { // only one of the operands is complex

        uint8_t *larray = (uint8_t *)lhs->array;
        uint8_t *rarray = (uint8_t *)rhs->array;
        uint8_t *lo = larray, *ro = rarray;
        int32_t *left_strides = lstrides;
        int32_t *right_strides = rstrides;
        uint8_t rdtype = rhs->dtype;

        // align the complex array to the left
        if(rhs->dtype == NDARRAY_COMPLEX) {
            // multiplication is commutative, so the swap is safe
            lo = (uint8_t *)rhs->array;
            ro = (uint8_t *)lhs->array;
            rdtype = lhs->dtype;
            left_strides = rstrides;
            right_strides = lstrides;
        }

        larray = lo;
        rarray = ro;
        // real part
        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);

        // second pass: start from the imaginary component of the complex operand
        // and from the second float slot of the (interleaved) result array
        larray = lo + sizeof(mp_float_t);
        rarray = ro;
        resarray = (mp_float_t *)results->array;
        resarray++;
        // imaginary part
        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);
    }
    return MP_OBJ_FROM_PTR(results);
}

// Element-wise subtraction when at least one operand is NDARRAY_COMPLEX;
// unlike add/multiply, subtraction is not commutative, so the two mixed
// cases (complex lhs vs. complex rhs) are handled separately below.
mp_obj_t carray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
    uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {

    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape,
NDARRAY_COMPLEX);
    mp_float_t *resarray = (mp_float_t *)results->array;

    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
        mp_float_t *larray = (mp_float_t *)lhs->array;
        mp_float_t *rarray = (mp_float_t *)rhs->array;

        // strides are byte counts; rescale for mp_float_t pointer arithmetic
        ulab_rescale_float_strides(lstrides);
        ulab_rescale_float_strides(rstrides);

        #if ULAB_MAX_DIMS > 3
        size_t i = 0;
        do {
        #endif
            #if ULAB_MAX_DIMS > 2
            size_t j = 0;
            do {
            #endif
                #if ULAB_MAX_DIMS > 1
                size_t k = 0;
                do {
                #endif
                    size_t l = 0;
                    do {
                        // real part
                        *resarray++ = larray[0] - rarray[0];
                        // imaginary part
                        *resarray++ = larray[1] - rarray[1];
                        larray += lstrides[ULAB_MAX_DIMS - 1];
                        rarray += rstrides[ULAB_MAX_DIMS - 1];
                        l++;
                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
                #if ULAB_MAX_DIMS > 1
                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    larray += lstrides[ULAB_MAX_DIMS - 2];
                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    rarray += rstrides[ULAB_MAX_DIMS - 2];
                    k++;
                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
                #endif /* ULAB_MAX_DIMS > 1 */
                #if ULAB_MAX_DIMS > 2
                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                larray += lstrides[ULAB_MAX_DIMS - 3];
                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                rarray += rstrides[ULAB_MAX_DIMS - 3];
                j++;
            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
            #endif /* ULAB_MAX_DIMS > 2 */
        #if ULAB_MAX_DIMS > 3
            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            larray += lstrides[ULAB_MAX_DIMS - 4];
            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            rarray += rstrides[ULAB_MAX_DIMS - 4];
            i++;
        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
        #endif /* ULAB_MAX_DIMS > 3 */
    } else {
        uint8_t *larray = (uint8_t *)lhs->array;
        if(lhs->dtype == NDARRAY_COMPLEX) {
            // complex - real: subtract from the real component, copy the imaginary one
            uint8_t *rarray = (uint8_t *)rhs->array;
            if(rhs->dtype == NDARRAY_UINT8) {
                BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, -);
            } else if(rhs->dtype == NDARRAY_INT8) {
                BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, -);
            } else if(rhs->dtype == NDARRAY_UINT16) {
                BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, -);
            } else if(rhs->dtype == NDARRAY_INT16) {
                BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, -);
            } else if(rhs->dtype == NDARRAY_FLOAT) {
                BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, -);
            }
            // copy the imaginary part
            uint8_t *tarray = (uint8_t *)results->array;
            tarray += sizeof(mp_float_t);

            larray = (uint8_t *)lhs->array;
            larray += sizeof(mp_float_t);

            carray_copy_part(tarray, larray, results->shape, lstrides);
        } else if(rhs->dtype == NDARRAY_COMPLEX) {
            // real - complex: dedicated reversed loop negates the imaginary part
            mp_float_t *rarray = (mp_float_t *)rhs->array;
            ulab_rescale_float_strides(rstrides);

            if(lhs->dtype == NDARRAY_UINT8) {
                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_INT8) {
                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_UINT16) {
                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_INT16) {
                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_FLOAT) {
                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
            }
        }
    }

    return MP_OBJ_FROM_PTR(results);
}

// helper for carray_binary_divide: complex lhs divided by real rhs,
// one dtype-dispatched pass per float component
static void carray_binary_left_divide_(ndarray_obj_t *results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
    int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {

    if(rdtype ==
NDARRAY_UINT8) {
        BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, /);
    } else if(rdtype == NDARRAY_INT8) {
        BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, /);
    } else if(rdtype == NDARRAY_UINT16) {
        BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, /);
    } else if(rdtype == NDARRAY_INT16) {
        BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, /);
    } else if(rdtype == NDARRAY_FLOAT) {
        BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, /);
    }
}

// Element-wise true division when at least one operand is NDARRAY_COMPLEX.
// NOTE(review): no check for a zero divisor here; division by zero yields
// inf/nan per IEEE float semantics — presumably intentional, matching numpy.
mp_obj_t carray_binary_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
    uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {

    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
    mp_float_t *resarray = (mp_float_t *)results->array;

    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
        mp_float_t *larray = (mp_float_t *)lhs->array;
        mp_float_t *rarray = (mp_float_t *)rhs->array;

        // strides are byte counts; rescale for mp_float_t pointer arithmetic
        ulab_rescale_float_strides(lstrides);
        ulab_rescale_float_strides(rstrides);

        #if ULAB_MAX_DIMS > 3
        size_t i = 0;
        do {
        #endif
            #if ULAB_MAX_DIMS > 2
            size_t j = 0;
            do {
            #endif
                #if ULAB_MAX_DIMS > 1
                size_t k = 0;
                do {
                #endif
                    size_t l = 0;
                    do {
                        // (a + bi) / (c + di) =
                        // (ac + bd) / (c^2 + d^2) + i (bc - ad) / (c^2 + d^2)
                        // denominator
                        mp_float_t denom = rarray[0] * rarray[0] + rarray[1] * rarray[1];

                        // real part
                        *resarray++ = (larray[0] * rarray[0] + larray[1] * rarray[1]) / denom;
                        // imaginary part
                        *resarray++ = (larray[1] * rarray[0] - larray[0] * rarray[1]) / denom;
                        larray += lstrides[ULAB_MAX_DIMS - 1];
                        rarray += rstrides[ULAB_MAX_DIMS - 1];
                        l++;
                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
                #if ULAB_MAX_DIMS > 1
                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    larray += lstrides[ULAB_MAX_DIMS - 2];
                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
                    rarray += rstrides[ULAB_MAX_DIMS - 2];
                    k++;
                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
                #endif /* ULAB_MAX_DIMS > 1 */
                #if ULAB_MAX_DIMS > 2
                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                larray += lstrides[ULAB_MAX_DIMS - 3];
                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
                rarray += rstrides[ULAB_MAX_DIMS - 3];
                j++;
            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
            #endif /* ULAB_MAX_DIMS > 2 */
        #if ULAB_MAX_DIMS > 3
            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            larray += lstrides[ULAB_MAX_DIMS - 4];
            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
            rarray += rstrides[ULAB_MAX_DIMS - 4];
            i++;
        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
        #endif /* ULAB_MAX_DIMS > 3 */
    } else {
        uint8_t *larray = (uint8_t *)lhs->array;
        uint8_t *rarray = (uint8_t *)rhs->array;
        if(lhs->dtype == NDARRAY_COMPLEX) {
            // complex / real: divide both components by the real operand
            // real part
            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
            // imaginary part
            resarray = (mp_float_t *)results->array;
            resarray++;
            larray = (uint8_t *)lhs->array;
            larray += sizeof(mp_float_t);
            rarray = (uint8_t *)rhs->array;
            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
        } else {
            // real / complex: dedicated loop multiplies by the complex conjugate
            if(lhs->dtype == NDARRAY_UINT8) {
                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_INT8) {
                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_UINT16) {
                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_INT16) {
                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
            } else if(lhs->dtype == NDARRAY_FLOAT) {
                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
            }
        }
    }

    return MP_OBJ_FROM_PTR(results);
}

#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray.h b/circuitpython/extmod/ulab/code/numpy/carray/carray.h
new file mode 100644
index 0000000..8ca5de2
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray.h
@@ -0,0 +1,237 @@

/*
 * This file is part of the micropython-ulab project,
 *
 * https://github.com/v923z/micropython-ulab
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2021-2022 Zoltán Vörös
*/

#ifndef _CARRAY_
#define _CARRAY_

MP_DECLARE_CONST_FUN_OBJ_1(carray_real_obj);
MP_DECLARE_CONST_FUN_OBJ_1(carray_imag_obj);
MP_DECLARE_CONST_FUN_OBJ_1(carray_conjugate_obj);
MP_DECLARE_CONST_FUN_OBJ_1(carray_sort_complex_obj);


mp_obj_t carray_imag(mp_obj_t );
mp_obj_t carray_real(mp_obj_t );

// binary operators for arrays with at least one complex operand;
// common signature: lhs, rhs, broadcast ndim/shape, lhs strides, rhs strides
mp_obj_t carray_abs(ndarray_obj_t *, ndarray_obj_t *);
mp_obj_t carray_binary_add(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
mp_obj_t carray_binary_multiply(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
mp_obj_t carray_binary_subtract(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
mp_obj_t carray_binary_divide(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *, mp_binary_op_t );

// innermost (1D) loop: applies OPERATOR between the float component at larray
// and the type_right value at rarray, writing every second slot of resarray
// (the result's float components are interleaved: real, imag, real, imag, ...)
#define BINARY_LOOP_COMPLEX1(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
    size_t l = 0;\
    do {\
        *(resarray) = *((mp_float_t *)(larray)) OPERATOR *((type_right *)(rarray));\
        (resarray) += 2;\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
        l++;\
    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\

#define
 BINARY_LOOP_COMPLEX2(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
    size_t k = 0;\
    do {\
        BINARY_LOOP_COMPLEX1((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
        k++;\
    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\

#define BINARY_LOOP_COMPLEX3(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
    size_t j = 0;\
    do {\
        BINARY_LOOP_COMPLEX2((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
        j++;\
    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\

#define BINARY_LOOP_COMPLEX4(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
    size_t i = 0;\
    do {\
        BINARY_LOOP_COMPLEX3((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
        i++;\
    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\

// real lhs minus complex rhs: real part is left - Re(right),
// imaginary part is the negated Im(right)
#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t l = 0;\
    do {\
        *(resarray)++ = *((type_left *)(larray)) - (rarray)[0];\
        *(resarray)++ = -(rarray)[1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
        l++;\
    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\

#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t k = 0;\
    do {\
        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
        k++;\
    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\

#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t j = 0;\
    do {\
        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
        j++;\
    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\

#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t i = 0;\
    do {\
        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
        i++;\
    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\

// real lhs divided by complex rhs: a / (c + di) = a * (c - di) / (c^2 + d^2)
#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t l = 0;\
    do {\
        mp_float_t *c = (mp_float_t *)(rarray);\
        mp_float_t denom = c[0] * c[0] + c[1] * c[1];\
        mp_float_t a = *((type_left *)(larray)) / denom;\
        *(resarray)++ = a * c[0];\
        *(resarray)++ = -a * c[1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
        l++;\
    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\

#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t k = 0;\
    do {\
        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
        k++;\
    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\

#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t j = 0;\
    do {\
        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
        j++;\
    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\

#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
    size_t i = 0;\
    do {\
        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
        i++;\
    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\


// equality loop: toggles the pre-filled boolean at *array when the complex value
// (real part *larray, imaginary part (larray)[1]) equals the real-valued operand;
// NOTE(review): larray is dereferenced as a float pointer here — the caller must
// pass an mp_float_t-typed pointer; confirm against the call site
#define BINARY_LOOP_COMPLEX_EQUAL1(results, array, type_right, larray, lstrides, rarray, rstrides)\
    size_t l = 0;\
    do {\
        if((*(larray) == *((type_right *)(rarray))) && ((larray)[1] == MICROPY_FLOAT_CONST(0.0))) {\
            *(array) ^= 0x01;\
        }\
        (array)++;\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
        l++;\
    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\

#define BINARY_LOOP_COMPLEX_EQUAL2(results, array, type_right, larray, lstrides, rarray, rstrides)\
    size_t k = 0;\
    do {\
        BINARY_LOOP_COMPLEX_EQUAL1((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
        k++;\
    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\

#define BINARY_LOOP_COMPLEX_EQUAL3(results, array, type_right, larray, lstrides, rarray, rstrides)\
    size_t j = 0;\
    do {\
        BINARY_LOOP_COMPLEX_EQUAL2((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
        j++;\
    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\

#define BINARY_LOOP_COMPLEX_EQUAL4(results, array, type_right, larray, lstrides, rarray, rstrides)\
    size_t i = 0;\
    do {\
        BINARY_LOOP_COMPLEX_EQUAL3((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
        i++;\
    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\

// select the loop depth matching the compile-time maximum number of dimensions
#if ULAB_MAX_DIMS == 1
#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX1
#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1
#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1
#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL1
#endif /* ULAB_MAX_DIMS == 1 */

#if ULAB_MAX_DIMS == 2
#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX2
#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2
#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2
#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL2
#endif /* ULAB_MAX_DIMS == 2 */

#if ULAB_MAX_DIMS == 3
#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX3
#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3
#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3
#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL3
#endif /* ULAB_MAX_DIMS == 3 */

#if ULAB_MAX_DIMS == 4
#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX4
#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4
#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4
#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL4
#endif /* ULAB_MAX_DIMS == 4 */

#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c
new file mode 100644
index 0000000..7b623d3
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c
@@ -0,0 +1,28 @@

/*
 * This file is part of the micropython-ulab project,
 *
 * https://github.com/v923z/micropython-ulab
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2022 Zoltán Vörös
*/

#include <math.h>
#include <stdlib.h>
#include
 <string.h>
#include "py/obj.h"
#include "py/runtime.h"
#include "py/misc.h"

#include "../../ulab.h"
#include "../../ndarray.h"

#if ULAB_SUPPORTS_COMPLEX

// Single shared raiser for every operation that is not defined on complex
// arrays; raising from one place keeps the firmware size down.
void raise_complex_NotImplementedError(void) {
    mp_raise_NotImplementedError(translate("not implemented for complex dtype"));
}

#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h
new file mode 100644
index 0000000..3ac79b5
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h
@@ -0,0 +1,25 @@

/*
 * This file is part of the micropython-ulab project,
 *
 * https://github.com/v923z/micropython-ulab
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2022 Zoltán Vörös
*/

#ifndef _CARRAY_TOOLS_
#define _CARRAY_TOOLS_

void raise_complex_NotImplementedError(void);

// These macros compile to no-ops when complex support is disabled, so callers
// can use them unconditionally.
#if ULAB_SUPPORTS_COMPLEX
    #define NOT_IMPLEMENTED_FOR_COMPLEX() raise_complex_NotImplementedError();
    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) if((dtype) == NDARRAY_COMPLEX) raise_complex_NotImplementedError();
#else
    #define NOT_IMPLEMENTED_FOR_COMPLEX() // do nothing
    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) // do nothing
#endif

#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/compare.c b/circuitpython/extmod/ulab/code/numpy/compare.c
new file mode 100644
index 0000000..5a82072
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/compare.c
@@ -0,0 +1,428 @@

/*
 * This file is part of the micropython-ulab project,
 *
 * https://github.com/v923z/micropython-ulab
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2020-2021 Zoltán Vörös
 *               2020 Jeff Epler for Adafruit Industries
*/

#include <math.h>
#include <stdlib.h>
#include <string.h>
#include "py/obj.h"
#include "py/runtime.h"
#include "py/misc.h"

#include "../ulab.h"
#include "../ndarray_operators.h"
#include "../ulab_tools.h"
#include "carray/carray_tools.h"
#include "compare.h"

static mp_obj_t
 compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t op) {
    // Common worker for all element-wise comparisons: coerces both arguments to
    // ndarrays, broadcasts them, then dispatches on the (lhs, rhs) dtype pair.
    ndarray_obj_t *lhs = ndarray_from_mp_obj(x1, 0);
    ndarray_obj_t *rhs = ndarray_from_mp_obj(x2, 0);
    #if ULAB_SUPPORTS_COMPLEX
    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) {
        NOT_IMPLEMENTED_FOR_COMPLEX()
    }
    #endif
    uint8_t ndim = 0;
    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
    int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS);
    int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
    if(!ndarray_can_broadcast(lhs, rhs, &ndim, shape, lstrides, rstrides)) {
        // NOTE(review): mp_raise_ValueError does not return, so the m_del
        // calls below it are dead code; the GC reclaims the buffers anyway
        mp_raise_ValueError(translate("operands could not be broadcast together"))
;
        m_del(size_t, shape, ULAB_MAX_DIMS);
        m_del(int32_t, lstrides, ULAB_MAX_DIMS);
        m_del(int32_t, rstrides, ULAB_MAX_DIMS);
    }

    uint8_t *larray = (uint8_t *)lhs->array;
    uint8_t *rarray = (uint8_t *)rhs->array;

    // (not-)equal take a dedicated path shared with the binary operators
    if(op == COMPARE_EQUAL) {
        return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_EQUAL);
    } else if(op == COMPARE_NOT_EQUAL) {
        return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_NOT_EQUAL);
    }
    // These are the upcasting rules
    // float always becomes float
    // operation on identical types preserves type
    // uint8 + int8 => int16
    // uint8 + int16 => int16
    // uint8 + uint16 => uint16
    // int8 + int16 => int16
    // int8 + uint16 => uint16
    // uint16 + int16 => float
    // The parameters of RUN_COMPARE_LOOP are
    // typecode of result, type_out, type_left, type_right, lhs operand, rhs operand, operator
    if(lhs->dtype == NDARRAY_UINT8) {
        if(rhs->dtype == NDARRAY_UINT8) {
            RUN_COMPARE_LOOP(NDARRAY_UINT8, uint8_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT8) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_UINT16) {
            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT16) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_FLOAT) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        }
    } else if(lhs->dtype == NDARRAY_INT8) {
        if(rhs->dtype == NDARRAY_UINT8) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT8) {
            RUN_COMPARE_LOOP(NDARRAY_INT8, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_UINT16) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT16) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_FLOAT) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        }
    } else if(lhs->dtype == NDARRAY_UINT16) {
        if(rhs->dtype == NDARRAY_UINT8) {
            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT8) {
            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_UINT16) {
            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT16) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_FLOAT) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        }
    } else if(lhs->dtype == NDARRAY_INT16) {
        if(rhs->dtype == NDARRAY_UINT8) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT8) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_UINT16) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT16) {
            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_FLOAT) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        }
    } else if(lhs->dtype == NDARRAY_FLOAT) {
        if(rhs->dtype == NDARRAY_UINT8) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT8) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_UINT16) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_INT16) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        } else if(rhs->dtype == NDARRAY_FLOAT) {
            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
        }
    }
    return mp_const_none; // we should never reach this point
}

static mp_obj_t compare_equal_helper(mp_obj_t x1, mp_obj_t x2, uint8_t comptype) {
    // scalar comparisons should return a single object of mp_obj_t type
    mp_obj_t result = compare_function(x1, x2, comptype);
    if((mp_obj_is_int(x1) || mp_obj_is_float(x1)) && (mp_obj_is_int(x2) || mp_obj_is_float(x2))) {
        // both inputs were scalars: unwrap the single element of the result array
        mp_obj_iter_buf_t iter_buf;
        mp_obj_t iterable = mp_getiter(result, &iter_buf);
        mp_obj_t item = mp_iternext(iterable);
        return item;
    }
    return result;
}

#if ULAB_NUMPY_HAS_CLIP

// numpy.clip(x1, x2, x3): clamp x1 to the [x2, x3] range
mp_obj_t compare_clip(mp_obj_t x1, mp_obj_t x2, mp_obj_t x3) {
    // Note: this function could be made faster by implementing a single-loop comparison in
    // RUN_COMPARE_LOOP. However, that would add around 2 kB of compile size, while we
    // would not gain a factor of two in speed, since the two comparisons should still be
    // evaluated. In contrast, calling the function twice adds only 140 bytes to the firmware
    if(mp_obj_is_int(x1) || mp_obj_is_float(x1)) {
        mp_float_t v1 = mp_obj_get_float(x1);
        mp_float_t v2 = mp_obj_get_float(x2);
        mp_float_t v3 = mp_obj_get_float(x3);
        if(v1 < v2) {
            return x2;
        } else if(v1 > v3) {
            return x3;
        } else {
            return x1;
        }
    } else { // assume ndarrays
        return compare_function(x2, compare_function(x1, x3, COMPARE_MINIMUM), COMPARE_MAXIMUM);
    }
}

MP_DEFINE_CONST_FUN_OBJ_3(compare_clip_obj, compare_clip);
#endif

#if ULAB_NUMPY_HAS_EQUAL

// numpy.equal(x1, x2)
mp_obj_t compare_equal(mp_obj_t x1, mp_obj_t x2) {
    return compare_equal_helper(x1, x2, COMPARE_EQUAL);
}

MP_DEFINE_CONST_FUN_OBJ_2(compare_equal_obj, compare_equal);
#endif

#if ULAB_NUMPY_HAS_NOTEQUAL

// numpy.not_equal(x1, x2)
mp_obj_t compare_not_equal(mp_obj_t x1, mp_obj_t x2) {
    return compare_equal_helper(x1, x2, COMPARE_NOT_EQUAL);
}

MP_DEFINE_CONST_FUN_OBJ_2(compare_not_equal_obj, compare_not_equal);
#endif

#if ULAB_NUMPY_HAS_ISFINITE | ULAB_NUMPY_HAS_ISINF
static mp_obj_t compare_isinf_isfinite(mp_obj_t _x, uint8_t mask) {
    // mask should signify, whether the function is called from isinf (mask = 1),
    // or from isfinite (mask = 0)
    if(mp_obj_is_int(_x)) {
        // integers are always finite
        if(mask) {
            return mp_const_false;
        } else {
            return mp_const_true;
} + } else if(mp_obj_is_float(_x)) { + mp_float_t x = mp_obj_get_float(_x); + if(isnan(x)) { + return mp_const_false; + } + if(mask) { // called from isinf + return isinf(x) ? mp_const_true : mp_const_false; + } else { // called from isfinite + return isinf(x) ? mp_const_false : mp_const_true; + } + } else if(mp_obj_is_type(_x, &ulab_ndarray_type)) { + ndarray_obj_t *x = MP_OBJ_TO_PTR(_x); + COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype) + ndarray_obj_t *results = ndarray_new_dense_ndarray(x->ndim, x->shape, NDARRAY_BOOL); + // At this point, results is all False + uint8_t *rarray = (uint8_t *)results->array; + if(x->dtype != NDARRAY_FLOAT) { + // int types can never be infinite... + if(!mask) { + // ...so flip all values in the array, if the function was called from isfinite + memset(rarray, 1, results->len); + } + return results; + } + uint8_t *xarray = (uint8_t *)x->array; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t value = *(mp_float_t *)xarray; + if(isnan(value)) { + *rarray++ = 0; + } else { + *rarray++ = isinf(value) ? 
mask : 1 - mask; + } + xarray += x->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < x->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + xarray -= x->strides[ULAB_MAX_DIMS - 1] * x->shape[ULAB_MAX_DIMS-1]; + xarray += x->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < x->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + xarray -= x->strides[ULAB_MAX_DIMS - 2] * x->shape[ULAB_MAX_DIMS-2]; + xarray += x->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < x->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + xarray -= x->strides[ULAB_MAX_DIMS - 3] * x->shape[ULAB_MAX_DIMS-3]; + xarray += x->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < x->shape[ULAB_MAX_DIMS - 4]); + #endif + + return results; + } else { + mp_raise_TypeError(translate("wrong input type")); + } + return mp_const_none; +} +#endif + +#if ULAB_NUMPY_HAS_ISFINITE +mp_obj_t compare_isfinite(mp_obj_t _x) { + return compare_isinf_isfinite(_x, 0); +} + +MP_DEFINE_CONST_FUN_OBJ_1(compare_isfinite_obj, compare_isfinite); +#endif + +#if ULAB_NUMPY_HAS_ISINF +mp_obj_t compare_isinf(mp_obj_t _x) { + return compare_isinf_isfinite(_x, 1); +} + +MP_DEFINE_CONST_FUN_OBJ_1(compare_isinf_obj, compare_isinf); +#endif + +#if ULAB_NUMPY_HAS_MAXIMUM +mp_obj_t compare_maximum(mp_obj_t x1, mp_obj_t x2) { + // extra round, so that we can return maximum(3, 4) properly + mp_obj_t result = compare_function(x1, x2, COMPARE_MAXIMUM); + if((mp_obj_is_int(x1) || mp_obj_is_float(x1)) && (mp_obj_is_int(x2) || mp_obj_is_float(x2))) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(result); + return mp_binary_get_val_array(ndarray->dtype, ndarray->array, 0); + } + return result; +} + +MP_DEFINE_CONST_FUN_OBJ_2(compare_maximum_obj, compare_maximum); +#endif + +#if ULAB_NUMPY_HAS_MINIMUM + +mp_obj_t compare_minimum(mp_obj_t x1, mp_obj_t x2) { + // extra round, so that we can return minimum(3, 4) properly + mp_obj_t result = compare_function(x1, x2, COMPARE_MINIMUM); + if((mp_obj_is_int(x1) || mp_obj_is_float(x1)) && 
(mp_obj_is_int(x2) || mp_obj_is_float(x2))) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(result); + return mp_binary_get_val_array(ndarray->dtype, ndarray->array, 0); + } + return result; +} + +MP_DEFINE_CONST_FUN_OBJ_2(compare_minimum_obj, compare_minimum); +#endif + +#if ULAB_NUMPY_HAS_WHERE + +mp_obj_t compare_where(mp_obj_t _condition, mp_obj_t _x, mp_obj_t _y) { + // this implementation will work with ndarrays, and scalars only + ndarray_obj_t *c = ndarray_from_mp_obj(_condition, 0); + ndarray_obj_t *x = ndarray_from_mp_obj(_x, 0); + ndarray_obj_t *y = ndarray_from_mp_obj(_y, 0); + + COMPLEX_DTYPE_NOT_IMPLEMENTED(c->dtype) + COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype) + COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype) + + int32_t *cstrides = m_new(int32_t, ULAB_MAX_DIMS); + int32_t *xstrides = m_new(int32_t, ULAB_MAX_DIMS); + int32_t *ystrides = m_new(int32_t, ULAB_MAX_DIMS); + + size_t *oshape = m_new(size_t, ULAB_MAX_DIMS); + + uint8_t ndim; + + // establish the broadcasting conditions first + // if any two of the arrays can be broadcast together, then + // the three arrays can also be broadcast together + if(!ndarray_can_broadcast(c, x, &ndim, oshape, cstrides, ystrides) || + !ndarray_can_broadcast(c, y, &ndim, oshape, cstrides, ystrides) || + !ndarray_can_broadcast(x, y, &ndim, oshape, xstrides, ystrides)) { + mp_raise_ValueError(translate("operands could not be broadcast together")); + } + + ndim = MAX(MAX(c->ndim, x->ndim), y->ndim); + + for(uint8_t i = 1; i <= ndim; i++) { + cstrides[ULAB_MAX_DIMS - i] = c->shape[ULAB_MAX_DIMS - i] < 2 ? 0 : c->strides[ULAB_MAX_DIMS - i]; + xstrides[ULAB_MAX_DIMS - i] = x->shape[ULAB_MAX_DIMS - i] < 2 ? 0 : x->strides[ULAB_MAX_DIMS - i]; + ystrides[ULAB_MAX_DIMS - i] = y->shape[ULAB_MAX_DIMS - i] < 2 ? 
0 : y->strides[ULAB_MAX_DIMS - i]; + oshape[ULAB_MAX_DIMS - i] = MAX(MAX(c->shape[ULAB_MAX_DIMS - i], x->shape[ULAB_MAX_DIMS - i]), y->shape[ULAB_MAX_DIMS - i]); + } + + uint8_t out_dtype = ndarray_upcast_dtype(x->dtype, y->dtype); + ndarray_obj_t *out = ndarray_new_dense_ndarray(ndim, oshape, out_dtype); + + mp_float_t (*cfunc)(void *) = ndarray_get_float_function(c->dtype); + mp_float_t (*xfunc)(void *) = ndarray_get_float_function(x->dtype); + mp_float_t (*yfunc)(void *) = ndarray_get_float_function(y->dtype); + mp_float_t (*ofunc)(void *, mp_float_t ) = ndarray_set_float_function(out->dtype); + + uint8_t *oarray = (uint8_t *)out->array; + uint8_t *carray = (uint8_t *)c->array; + uint8_t *xarray = (uint8_t *)x->array; + uint8_t *yarray = (uint8_t *)y->array; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t value; + mp_float_t cvalue = cfunc(carray); + if(cvalue != MICROPY_FLOAT_CONST(0.0)) { + value = xfunc(xarray); + } else { + value = yfunc(yarray); + } + ofunc(oarray, value); + oarray += out->itemsize; + carray += cstrides[ULAB_MAX_DIMS - 1]; + xarray += xstrides[ULAB_MAX_DIMS - 1]; + yarray += ystrides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < out->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + carray -= cstrides[ULAB_MAX_DIMS - 1] * c->shape[ULAB_MAX_DIMS-1]; + carray += cstrides[ULAB_MAX_DIMS - 2]; + xarray -= xstrides[ULAB_MAX_DIMS - 1] * x->shape[ULAB_MAX_DIMS-1]; + xarray += xstrides[ULAB_MAX_DIMS - 2]; + yarray -= ystrides[ULAB_MAX_DIMS - 1] * y->shape[ULAB_MAX_DIMS-1]; + yarray += ystrides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < out->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + carray -= cstrides[ULAB_MAX_DIMS - 2] * c->shape[ULAB_MAX_DIMS-2]; + carray += cstrides[ULAB_MAX_DIMS - 3]; + xarray -= xstrides[ULAB_MAX_DIMS - 2] * x->shape[ULAB_MAX_DIMS-2]; + xarray += 
xstrides[ULAB_MAX_DIMS - 3]; + yarray -= ystrides[ULAB_MAX_DIMS - 2] * y->shape[ULAB_MAX_DIMS-2]; + yarray += ystrides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < out->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + carray -= cstrides[ULAB_MAX_DIMS - 3] * c->shape[ULAB_MAX_DIMS-3]; + carray += cstrides[ULAB_MAX_DIMS - 4]; + xarray -= xstrides[ULAB_MAX_DIMS - 3] * x->shape[ULAB_MAX_DIMS-3]; + xarray += xstrides[ULAB_MAX_DIMS - 4]; + yarray -= ystrides[ULAB_MAX_DIMS - 3] * y->shape[ULAB_MAX_DIMS-3]; + yarray += ystrides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < out->shape[ULAB_MAX_DIMS - 4]); + #endif + return MP_OBJ_FROM_PTR(out); +} + +MP_DEFINE_CONST_FUN_OBJ_3(compare_where_obj, compare_where); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/compare.h b/circuitpython/extmod/ulab/code/numpy/compare.h new file mode 100644 index 0000000..90ceaf7 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/compare.h @@ -0,0 +1,150 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#ifndef _COMPARE_ +#define _COMPARE_ + +#include "../ulab.h" +#include "../ndarray.h" + +enum COMPARE_FUNCTION_TYPE { + COMPARE_EQUAL, + COMPARE_NOT_EQUAL, + COMPARE_MINIMUM, + COMPARE_MAXIMUM, + COMPARE_CLIP, +}; + +MP_DECLARE_CONST_FUN_OBJ_3(compare_clip_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_equal_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_isfinite_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_isinf_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_minimum_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_maximum_obj); +MP_DECLARE_CONST_FUN_OBJ_2(compare_not_equal_obj); +MP_DECLARE_CONST_FUN_OBJ_3(compare_where_obj); + +#if ULAB_MAX_DIMS == 1 +#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t l = 0;\ + do {\ + *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR 
*((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\ + (array) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < results->shape[ULAB_MAX_DIMS - 1]);\ + return MP_OBJ_FROM_PTR(results);\ + +#endif // ULAB_MAX_DIMS == 1 + +#if ULAB_MAX_DIMS == 2 +#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\ + (array) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < results->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ + return MP_OBJ_FROM_PTR(results);\ + +#endif // ULAB_MAX_DIMS == 2 + +#if ULAB_MAX_DIMS == 3 +#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 
(type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\ + (array) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < results->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < results->shape[ULAB_MAX_DIMS - 3]);\ + return MP_OBJ_FROM_PTR(results);\ + +#endif // ULAB_MAX_DIMS == 3 + +#if ULAB_MAX_DIMS == 4 +#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\ + size_t i = 0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? 
(type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\ + (array) += (results)->strides[ULAB_MAX_DIMS - 1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < results->shape[ULAB_MAX_DIMS - 1]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < results->shape[ULAB_MAX_DIMS - 2]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < results->shape[ULAB_MAX_DIMS - 3]);\ + (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];\ + (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];\ + (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < results->shape[ULAB_MAX_DIMS - 4]);\ + return MP_OBJ_FROM_PTR(results);\ + +#endif // ULAB_MAX_DIMS == 4 + +#define RUN_COMPARE_LOOP(dtype, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, ndim, shape, op) do {\ + ndarray_obj_t *results = ndarray_new_dense_ndarray((ndim), (shape), (dtype));\ + uint8_t *array = (uint8_t *)results->array;\ + if((op) == COMPARE_MINIMUM) {\ + COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, <);\ + }\ + if((op) == COMPARE_MAXIMUM) {\ + COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, >);\ + }\ +} while(0) + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/create.c b/circuitpython/extmod/ulab/code/numpy/create.c new file mode 100644 index 0000000..5777070 
--- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/create.c @@ -0,0 +1,783 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2019-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" + +#include "../ulab.h" +#include "create.h" +#include "../ulab_tools.h" + +#if ULAB_NUMPY_HAS_ONES | ULAB_NUMPY_HAS_ZEROS | ULAB_NUMPY_HAS_FULL | ULAB_NUMPY_HAS_EMPTY +static mp_obj_t create_zeros_ones_full(mp_obj_t oshape, uint8_t dtype, mp_obj_t value) { + if(!mp_obj_is_int(oshape) && !mp_obj_is_type(oshape, &mp_type_tuple) && !mp_obj_is_type(oshape, &mp_type_list)) { + mp_raise_TypeError(translate("input argument must be an integer, a tuple, or a list")); + } + ndarray_obj_t *ndarray = NULL; + if(mp_obj_is_int(oshape)) { + size_t n = mp_obj_get_int(oshape); + ndarray = ndarray_new_linear_array(n, dtype); + } else if(mp_obj_is_type(oshape, &mp_type_tuple) || mp_obj_is_type(oshape, &mp_type_list)) { + uint8_t len = (uint8_t)mp_obj_get_int(mp_obj_len_maybe(oshape)); + if(len > ULAB_MAX_DIMS) { + mp_raise_TypeError(translate("too many dimensions")); + } + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, ULAB_MAX_DIMS * sizeof(size_t)); + size_t i = 0; + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(oshape, &iter_buf); + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION){ + shape[ULAB_MAX_DIMS - len + i] = (size_t)mp_obj_get_int(item); + i++; + } + ndarray = ndarray_new_dense_ndarray(len, shape, dtype); + } + if(value != mp_const_none) { + if(dtype == NDARRAY_BOOL) { + dtype = NDARRAY_UINT8; + if(mp_obj_is_true(value)) { + value = mp_obj_new_int(1); + } else { + value = mp_obj_new_int(0); + } + } + for(size_t i=0; i < ndarray->len; i++) { + #if 
ULAB_SUPPORTS_COMPLEX + if(dtype == NDARRAY_COMPLEX) { + ndarray_set_complex_value(ndarray->array, i, value); + } else { + ndarray_set_value(dtype, ndarray->array, i, value); + } + #else + ndarray_set_value(dtype, ndarray->array, i, value); + #endif + } + } + // if zeros calls the function, we don't have to do anything + return MP_OBJ_FROM_PTR(ndarray); +} +#endif + +#if ULAB_NUMPY_HAS_ARANGE | ULAB_NUMPY_HAS_LINSPACE +static ndarray_obj_t *create_linspace_arange(mp_float_t start, mp_float_t step, mp_float_t stop, size_t len, uint8_t dtype) { + mp_float_t value = start; + + ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype); + if(ndarray->boolean == NDARRAY_BOOLEAN) { + uint8_t *array = (uint8_t *)ndarray->array; + for(size_t i=0; i < len; i++, value += step) { + *array++ = value == MICROPY_FLOAT_CONST(0.0) ? 0 : 1; + } + } else if(dtype == NDARRAY_UINT8) { + ARANGE_LOOP(uint8_t, ndarray, len, step, stop); + } else if(dtype == NDARRAY_INT8) { + ARANGE_LOOP(int8_t, ndarray, len, step, stop); + } else if(dtype == NDARRAY_UINT16) { + ARANGE_LOOP(uint16_t, ndarray, len, step, stop); + } else if(dtype == NDARRAY_INT16) { + ARANGE_LOOP(int16_t, ndarray, len, step, stop); + } else { + ARANGE_LOOP(mp_float_t, ndarray, len, step, stop); + } + return ndarray; +} +#endif + +#if ULAB_NUMPY_HAS_ARANGE +//| @overload +//| def arange(stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: ... +//| @overload +//| def arange(start: _float, stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: +//| """ +//| .. param: start +//| First value in the array, optional, defaults to 0 +//| .. param: stop +//| Final value in the array +//| .. param: step +//| Difference between consecutive elements, optional, defaults to 1.0 +//| .. param: dtype +//| Type of values in the array +//| +//| Return a new 1-D array with elements ranging from ``start`` to ``stop``, with step size ``step``.""" +//| ... 
+//| + +mp_obj_t create_arange(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + uint8_t dtype = NDARRAY_FLOAT; + mp_float_t start, stop, step; + if(n_args == 1) { + start = MICROPY_FLOAT_CONST(0.0); + stop = mp_obj_get_float(args[0].u_obj); + step = MICROPY_FLOAT_CONST(1.0); + if(mp_obj_is_int(args[0].u_obj)) dtype = NDARRAY_INT16; + } else if(n_args == 2) { + start = mp_obj_get_float(args[0].u_obj); + stop = mp_obj_get_float(args[1].u_obj); + step = MICROPY_FLOAT_CONST(1.0); + if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj)) dtype = NDARRAY_INT16; + } else if(n_args == 3) { + start = mp_obj_get_float(args[0].u_obj); + stop = mp_obj_get_float(args[1].u_obj); + step = mp_obj_get_float(args[2].u_obj); + if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj) && mp_obj_is_int(args[2].u_obj)) dtype = NDARRAY_INT16; + } else { + mp_raise_TypeError(translate("wrong number of arguments")); + } + if((MICROPY_FLOAT_C_FUN(fabs)(stop) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(start) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(step) > 32768)) { + dtype = NDARRAY_FLOAT; + } + if(args[3].u_obj != mp_const_none) { + dtype = (uint8_t)mp_obj_get_int(args[3].u_obj); + } + ndarray_obj_t *ndarray; + if((stop - start)/step < 0) { + ndarray = ndarray_new_linear_array(0, dtype); + } else { + size_t len = (size_t)(MICROPY_FLOAT_C_FUN(ceil)((stop - start) / step)); + stop = start + (len - 1) * step; + ndarray = create_linspace_arange(start, step, stop, len, dtype); + } + return 
MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(create_arange_obj, 1, create_arange); +#endif + +#if ULAB_NUMPY_HAS_CONCATENATE +//| def concatenate(arrays: Tuple[ulab.numpy.ndarray], *, axis: int = 0) -> ulab.numpy.ndarray: +//| """ +//| .. param: arrays +//| tuple of ndarrays +//| .. param: axis +//| axis along which the arrays will be joined +//| +//| Join a sequence of arrays along an existing axis.""" +//| ... +//| + +mp_obj_t create_concatenate(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &mp_type_tuple)) { + mp_raise_TypeError(translate("first argument must be a tuple of ndarrays")); + } + int8_t axis = (int8_t)args[1].u_int; + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + mp_obj_tuple_t *ndarrays = MP_OBJ_TO_PTR(args[0].u_obj); + + // first check, whether the arrays are compatible + ndarray_obj_t *_ndarray = MP_OBJ_TO_PTR(ndarrays->items[0]); + uint8_t dtype = _ndarray->dtype; + uint8_t ndim = _ndarray->ndim; + if(axis < 0) { + axis += ndim; + } + if((axis < 0) || (axis >= ndim)) { + mp_raise_ValueError(translate("wrong axis specified")); + } + // shift axis + axis = ULAB_MAX_DIMS - ndim + axis; + for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) { + shape[j] = _ndarray->shape[j]; + } + + for(uint8_t i=1; i < ndarrays->len; i++) { + _ndarray = MP_OBJ_TO_PTR(ndarrays->items[i]); + // check, whether the arrays are compatible + if((dtype != _ndarray->dtype) || (ndim != _ndarray->ndim)) { + mp_raise_ValueError(translate("input arrays are not compatible")); + } + for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) { + if(j == axis) { 
+ shape[j] += _ndarray->shape[j]; + } else { + if(shape[j] != _ndarray->shape[j]) { + mp_raise_ValueError(translate("input arrays are not compatible")); + } + } + } + } + + ndarray_obj_t *target = ndarray_new_dense_ndarray(ndim, shape, dtype); + uint8_t *tpos = (uint8_t *)target->array; + uint8_t *tarray; + + for(uint8_t p=0; p < ndarrays->len; p++) { + // reset the pointer along the axis + ndarray_obj_t *source = MP_OBJ_TO_PTR(ndarrays->items[p]); + uint8_t *sarray = (uint8_t *)source->array; + tarray = tpos; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(tarray, sarray, source->itemsize); + tarray += target->strides[ULAB_MAX_DIMS - 1]; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + tarray -= target->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + tarray += target->strides[ULAB_MAX_DIMS - 2]; + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + tarray -= target->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + tarray += target->strides[ULAB_MAX_DIMS - 3]; + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + tarray -= target->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + tarray += target->strides[ULAB_MAX_DIMS - 4]; + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif + if(p < ndarrays->len - 1) { + tpos += 
target->strides[axis] * source->shape[axis]; + } + } + return MP_OBJ_FROM_PTR(target); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(create_concatenate_obj, 1, create_concatenate); +#endif + +#if ULAB_MAX_DIMS > 1 +#if ULAB_NUMPY_HAS_DIAG +//| def diag(a: ulab.numpy.ndarray, *, k: int = 0) -> ulab.numpy.ndarray: +//| """ +//| .. param: a +//| an ndarray +//| .. param: k +//| Offset of the diagonal from the main diagonal. Can be positive or negative. +//| +//| Return specified diagonals.""" +//| ... +//| + +mp_obj_t create_diag(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("input must be an ndarray")); + } + ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj); + if(source->ndim == 1) { // return a rank-2 tensor with the prescribed diagonal + ndarray_obj_t *target = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, source->len, source->len), source->dtype); + uint8_t *sarray = (uint8_t *)source->array; + uint8_t *tarray = (uint8_t *)target->array; + for(size_t i=0; i < source->len; i++) { + memcpy(tarray, sarray, source->itemsize); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + tarray += (source->len + 1) * target->itemsize; + } + return MP_OBJ_FROM_PTR(target); + } + if(source->ndim > 2) { + mp_raise_TypeError(translate("input must be a tensor of rank 2")); + } + int32_t k = args[1].u_int; + size_t len = 0; + uint8_t *sarray = (uint8_t *)source->array; + if(k < 0) { // move the pointer "vertically" + if(-k < (int32_t)source->shape[ULAB_MAX_DIMS - 2]) { + sarray -= k * source->strides[ULAB_MAX_DIMS - 2]; + len = 
MIN(source->shape[ULAB_MAX_DIMS - 2] + k, source->shape[ULAB_MAX_DIMS - 1]); + } + } else { // move the pointer "horizontally" + if(k < (int32_t)source->shape[ULAB_MAX_DIMS - 1]) { + sarray += k * source->strides[ULAB_MAX_DIMS - 1]; + len = MIN(source->shape[ULAB_MAX_DIMS - 1] - k, source->shape[ULAB_MAX_DIMS - 2]); + } + } + + if(len == 0) { + mp_raise_ValueError(translate("offset is too large")); + } + + ndarray_obj_t *target = ndarray_new_linear_array(len, source->dtype); + uint8_t *tarray = (uint8_t *)target->array; + + for(size_t i=0; i < len; i++) { + memcpy(tarray, sarray, source->itemsize); + sarray += source->strides[ULAB_MAX_DIMS - 2]; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + tarray += source->itemsize; + } + return MP_OBJ_FROM_PTR(target); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(create_diag_obj, 1, create_diag); +#endif /* ULAB_NUMPY_HAS_DIAG */ + +#if ULAB_NUMPY_HAS_EMPTY +// This function is bound in numpy.c to numpy.zeros(), and is simply an alias for that + +//| def empty(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: +//| """ +//| .. param: shape +//| Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array) +//| .. param: dtype +//| Type of values in the array +//| +//| Return a new array of the given shape with all elements set to 0. An alias for numpy.zeros.""" +//| ... +//| +#endif + +#if ULAB_NUMPY_HAS_EYE +//| def eye(size: int, *, M: Optional[int] = None, k: int = 0, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: +//| """Return a new square array of size, with the diagonal elements set to 1 +//| and the other elements set to 0. If k is given, the diagonal is shifted by the specified amount.""" +//| ... 
+//| + +mp_obj_t create_eye(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_INT, { .u_int = 0 } }, + { MP_QSTR_M, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } }, + { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + size_t n = args[0].u_int, m; + size_t k = args[2].u_int > 0 ? (size_t)args[2].u_int : (size_t)(-args[2].u_int); + uint8_t dtype = args[3].u_int; + if(args[1].u_rom_obj == mp_const_none) { + m = n; + } else { + m = mp_obj_get_int(args[1].u_rom_obj); + } + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, n, m), dtype); + if(dtype == NDARRAY_BOOL) { + dtype = NDARRAY_UINT8; + } + mp_obj_t one = mp_obj_new_int(1); + size_t i = 0; + if((args[2].u_int >= 0)) { + while(k < m) { + ndarray_set_value(dtype, ndarray->array, i*m+k, one); + k++; + i++; + } + } else { + while(k < n) { + ndarray_set_value(dtype, ndarray->array, k*m+i, one); + k++; + i++; + } + } + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(create_eye_obj, 1, create_eye); +#endif /* ULAB_NUMPY_HAS_EYE */ +#endif /* ULAB_MAX_DIMS > 1 */ + +#if ULAB_NUMPY_HAS_FULL +//| def full(shape: Union[int, Tuple[int, ...]], fill_value: Union[_float, _bool], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: +//| """ +//| .. param: shape +//| Shape of the array, either an integer (for a 1-D array) or a tuple of integers (for tensors of higher rank) +//| .. param: fill_value +//| scalar, the value with which the array is filled +//| .. param: dtype +//| Type of values in the array +//| +//| Return a new array of the given shape with all elements set to 0.""" +//| ... 
+//| + +mp_obj_t create_full(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } }, + { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + uint8_t dtype = args[2].u_int; + + return create_zeros_ones_full(args[0].u_obj, dtype, args[1].u_obj); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(create_full_obj, 0, create_full); +#endif + + +#if ULAB_NUMPY_HAS_LINSPACE +//| def linspace( +//| start: _float, +//| stop: _float, +//| *, +//| dtype: _DType = ulab.numpy.float, +//| num: int = 50, +//| endpoint: _bool = True, +//| retstep: _bool = False +//| ) -> ulab.numpy.ndarray: +//| """ +//| .. param: start +//| First value in the array +//| .. param: stop +//| Final value in the array +//| .. param int: num +//| Count of values in the array. +//| .. param: dtype +//| Type of values in the array +//| .. param bool: endpoint +//| Whether the ``stop`` value is included. Note that even when +//| endpoint=True, the exact ``stop`` value may not be included due to the +//| inaccuracy of floating point arithmetic. +//| .. param bool: retstep, +//| If True, return (`samples`, `step`), where `step` is the spacing between samples. +//| +//| Return a new 1-D array with ``num`` elements ranging from ``start`` to ``stop`` linearly.""" +//| ... 
//|

// numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=float):
// num evenly spaced samples from start to stop.  If either endpoint is a
// Python complex (and ulab is built with complex support), the result is a
// complex array and the real/imaginary parts are interpolated independently.
mp_obj_t create_linspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // start
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // stop
        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
        { MP_QSTR_retstep, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_false } },
        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
    };

    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    if(args[2].u_int < 2) {
        mp_raise_ValueError(translate("number of points must be at least 2"));
    }
    size_t len = (size_t)args[2].u_int;
    mp_float_t start, step, stop;

    ndarray_obj_t *ndarray = NULL;

    #if ULAB_SUPPORTS_COMPLEX
    // step_real/step_imag stay uninitialised unless complex_out is set; they
    // are only read below under the same complex_out guard
    mp_float_t step_real, step_imag;
    bool complex_out = false;

    if(mp_obj_is_type(args[0].u_obj, &mp_type_complex) || mp_obj_is_type(args[1].u_obj, &mp_type_complex)) {
        complex_out = true;
        ndarray = ndarray_new_linear_array(len, NDARRAY_COMPLEX);
        // complex storage is interleaved (re, im) mp_float_t pairs
        mp_float_t *array = (mp_float_t *)ndarray->array;
        mp_float_t start_real, start_imag;
        mp_float_t stop_real, stop_imag;

        mp_obj_get_complex(args[0].u_obj, &start_real, &start_imag);
        mp_obj_get_complex(args[1].u_obj, &stop_real, &stop_imag);
        if(args[3].u_obj == mp_const_true) {
            // endpoint=True: len - 1 intervals, last sample is exactly stop
            step_real = (stop_real - start_real) / (len - 1);
            step_imag = (stop_imag - start_imag) / (len - 1);
        } else {
            step_real = (stop_real - start_real) / len;
            step_imag = (stop_imag - start_imag) / len;
        }

        for(size_t i = 0; i < len; i++) {
            *array++ = start_real;
            *array++ = start_imag;
            start_real += step_real;
            start_imag += step_imag;
        }
    } else {
    #endif
    start = mp_obj_get_float(args[0].u_obj);
    stop = mp_obj_get_float(args[1].u_obj);

    uint8_t typecode = args[5].u_int;

    if(args[3].u_obj == mp_const_true) {
        step = (stop - start) / (len - 1);
    } else {
        // endpoint=False: recompute the effective last value for the helper
        step = (stop - start) / len;
        stop = start + step * (len - 1);
    }

    // shared fill loop, also used by arange()
    ndarray = create_linspace_arange(start, step, stop, len, typecode);
    #if ULAB_SUPPORTS_COMPLEX
    }
    #endif

    if(args[4].u_obj == mp_const_false) {
        return MP_OBJ_FROM_PTR(ndarray);
    } else {
        // retstep=True: return the (samples, step) tuple, numpy-style
        mp_obj_t tuple[2];
        tuple[0] = ndarray;
        #if ULAB_SUPPORTS_COMPLEX
        if(complex_out) {
            tuple[1] = mp_obj_new_complex(step_real, step_imag);
        } else {
            tuple[1] = mp_obj_new_float(step);
        }
        #else /* ULAB_SUPPORTS_COMPLEX */
        tuple[1] = mp_obj_new_float(step);
        #endif

        return mp_obj_new_tuple(2, tuple);
    }
}

MP_DEFINE_CONST_FUN_OBJ_KW(create_linspace_obj, 2, create_linspace);
#endif

#if ULAB_NUMPY_HAS_LOGSPACE
//| def logspace(
//|     start: _float,
//|     stop: _float,
//|     *,
//|     dtype: _DType = ulab.numpy.float,
//|     num: int = 50,
//|     endpoint: _bool = True,
//|     base: _float = 10.0
//| ) -> ulab.numpy.ndarray:
//|     """
//|     .. param: start
//|       First value in the array
//|     .. param: stop
//|       Final value in the array
//|     .. param int: num
//|       Count of values in the array. Defaults to 50.
//|     .. param: base
//|       The base of the log space. The step size between the elements in
//|       ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform. Defaults to 10.0.
//|     .. param: dtype
//|       Type of values in the array
//|     .. param bool: endpoint
//|       Whether the ``stop`` value is included.  Note that even when
//|       endpoint=True, the exact ``stop`` value may not be included due to the
//|       inaccuracy of floating point arithmetic. Defaults to True.
//|
//|     Return a new 1-D array with ``num`` evenly spaced elements on a log scale.
//|     The sequence starts at ``base ** start``, and ends with ``base ** stop``."""
//|     ...
//|

// boxed float 10.0: ROM default object for logspace's `base` keyword
const mp_obj_float_t create_float_const_ten = {{&mp_type_float}, MICROPY_FLOAT_CONST(10.0)};

// numpy.logspace(start, stop, num=50, base=10.0, endpoint=True, dtype=float):
// num samples evenly spaced on a log scale, from base**start to base**stop.
mp_obj_t create_logspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // start (exponent)
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // stop (exponent)
        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
        { MP_QSTR_base, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_PTR(&create_float_const_ten) } },
        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
    };

    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    if(args[2].u_int < 2) {
        mp_raise_ValueError(translate("number of points must be at least 2"));
    }
    size_t len = (size_t)args[2].u_int;
    mp_float_t start, step, quotient;
    start = mp_obj_get_float(args[0].u_obj);
    uint8_t dtype = args[5].u_int;
    mp_float_t base = mp_obj_get_float(args[3].u_obj);
    // step in exponent space; with endpoint=True the last sample is base**stop
    if(args[4].u_obj == mp_const_true) step = (mp_obj_get_float(args[1].u_obj) - start)/(len - 1);
    else step = (mp_obj_get_float(args[1].u_obj) - start) / len;
    // consecutive samples differ by the constant factor base**step
    quotient = MICROPY_FLOAT_C_FUN(pow)(base, step);
    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);

    mp_float_t value = MICROPY_FLOAT_C_FUN(pow)(base, start);
    if(ndarray->dtype == NDARRAY_UINT8) {
        uint8_t *array = (uint8_t *)ndarray->array;
        if(ndarray->boolean) {
            // dtype=bool: every logspace sample is non-zero, so fill with True
            memset(array, 1, len);
        } else {
            // integer dtypes truncate each sample via a plain C cast
            for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint8_t)value;
        }
    } else if(ndarray->dtype == NDARRAY_INT8) {
        int8_t *array = (int8_t *)ndarray->array;
        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int8_t)value;
    } else if(ndarray->dtype == NDARRAY_UINT16) {
        uint16_t *array = (uint16_t *)ndarray->array;
        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint16_t)value;
    } else if(ndarray->dtype == NDARRAY_INT16) {
        int16_t *array = (int16_t *)ndarray->array;
        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int16_t)value;
    } else {
        mp_float_t *array = (mp_float_t *)ndarray->array;
        for(size_t i=0; i < len; i++, value *= quotient) *array++ = value;
    }
    return MP_OBJ_FROM_PTR(ndarray);
}

MP_DEFINE_CONST_FUN_OBJ_KW(create_logspace_obj, 2, create_logspace);
#endif

#if ULAB_NUMPY_HAS_ONES
//| def ones(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
//|     """
//|     .. param: shape
//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
//|     .. param: dtype
//|       Type of values in the array
//|
//|     Return a new array of the given shape with all elements set to 1."""
//|     ...
//|

// numpy.ones(shape, *, dtype=float): delegate to create_zeros_ones_full()
// with a boxed integer 1 as the fill value.
mp_obj_t create_ones(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
    };

    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    uint8_t dtype = args[1].u_int;
    mp_obj_t one = mp_obj_new_int(1);
    return create_zeros_ones_full(args[0].u_obj, dtype, one);
}

MP_DEFINE_CONST_FUN_OBJ_KW(create_ones_obj, 0, create_ones);
#endif

#if ULAB_NUMPY_HAS_ZEROS
//| def zeros(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
//|     """
//|     .. param: shape
//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
//|     .. param: dtype
//|       Type of values in the array
//|
//|     Return a new array of the given shape with all elements set to 0."""
//|     ...
//|

// numpy.zeros(shape, *, dtype=float): delegate to create_zeros_ones_full();
// mp_const_none as fill value selects the zero fill there.
mp_obj_t create_zeros(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
    };

    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    uint8_t dtype = args[1].u_int;
    return create_zeros_ones_full(args[0].u_obj, dtype, mp_const_none);
}

MP_DEFINE_CONST_FUN_OBJ_KW(create_zeros_obj, 0, create_zeros);
#endif

#if ULAB_NUMPY_HAS_FROMBUFFER
// numpy.frombuffer(buffer, dtype=float, count=-1, offset=0): build a 1-D
// ndarray over an existing buffer.  NOTE: the result is a view, not a copy --
// ndarray->array is pointed directly at the caller's memory (buffer + offset),
// so mutating the array mutates the buffer and vice versa.
mp_obj_t create_frombuffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
        { MP_QSTR_count, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(-1) } },  // -1: use the whole buffer
        { MP_QSTR_offset, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(0) } },  // byte offset into the buffer
    };

    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    uint8_t dtype = mp_obj_get_int(args[1].u_obj);
    // negative offsets wrap to a huge size_t and are rejected by the
    // bufinfo.len < offset check below
    size_t offset = mp_obj_get_int(args[3].u_obj);

    mp_buffer_info_t bufinfo;
    if(mp_get_buffer(args[0].u_obj, &bufinfo, MP_BUFFER_READ)) {
        size_t sz = ulab_binary_get_size(dtype); // bytes per element

        if(bufinfo.len < offset) {
            mp_raise_ValueError(translate("offset must be non-negative and no greater than buffer length"));
        }
        size_t len = (bufinfo.len - offset) / sz;
        if((len * sz) != (bufinfo.len - offset)) {
            mp_raise_ValueError(translate("buffer size must be a multiple of element size"));
        }
        if(mp_obj_get_int(args[2].u_obj) > 0) {
            // explicit count: must fit in what the buffer provides
            size_t count = mp_obj_get_int(args[2].u_obj);
            if(len < count) {
                mp_raise_ValueError(translate("buffer is smaller than requested size"));
            } else {
                len = count;
            }
        }
        // assemble the ndarray header by hand instead of calling a helper,
        // because no fresh data buffer must be allocated here
        ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
        ndarray->base.type = &ulab_ndarray_type;
        ndarray->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
        ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
        ndarray->ndim = 1;
        ndarray->len = len;
        ndarray->itemsize = sz;
        ndarray->shape[ULAB_MAX_DIMS - 1] = len;
        ndarray->strides[ULAB_MAX_DIMS - 1] = sz;

        uint8_t *buffer = bufinfo.buf;
        ndarray->array = buffer + offset; // alias the source buffer (no copy)
        return MP_OBJ_FROM_PTR(ndarray);
    }
    // object does not support the buffer protocol: silently return None
    return mp_const_none;
}

MP_DEFINE_CONST_FUN_OBJ_KW(create_frombuffer_obj, 1, create_frombuffer);
#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/create.h b/circuitpython/extmod/ulab/code/numpy/create.h
new file mode 100644
index 0000000..18f636c
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/create.h
@@ -0,0 +1,79 @@
/*
 * This file is part of the micropython-ulab project,
 *
 * https://github.com/v923z/micropython-ulab
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2020 Jeff Epler for Adafruit Industries
 *               2019-2021 Zoltán Vörös
*/

#ifndef _CREATE_
#define _CREATE_

#include "../ulab.h"
#include "../ndarray.h"

// prototypes for the array-construction routines in create.c; each is
// compiled in only when the corresponding ULAB_NUMPY_HAS_* switch is set

#if ULAB_NUMPY_HAS_ARANGE
mp_obj_t create_arange(size_t , const mp_obj_t *, mp_map_t *);
MP_DECLARE_CONST_FUN_OBJ_KW(create_arange_obj);
#endif

#if ULAB_NUMPY_HAS_CONCATENATE
mp_obj_t create_concatenate(size_t , const mp_obj_t *, mp_map_t *);
MP_DECLARE_CONST_FUN_OBJ_KW(create_concatenate_obj);
#endif

#if ULAB_NUMPY_HAS_DIAG
mp_obj_t create_diag(size_t , const mp_obj_t *, mp_map_t *);
MP_DECLARE_CONST_FUN_OBJ_KW(create_diag_obj);
#endif

#if ULAB_MAX_DIMS > 1
#if ULAB_NUMPY_HAS_EYE
mp_obj_t create_eye(size_t , const mp_obj_t *, mp_map_t *);
MP_DECLARE_CONST_FUN_OBJ_KW(create_eye_obj);
#endif
#endif

#if ULAB_NUMPY_HAS_FULL
mp_obj_t create_full(size_t , const mp_obj_t *, mp_map_t *);
MP_DECLARE_CONST_FUN_OBJ_KW(create_full_obj);
#endif

#if
ULAB_NUMPY_HAS_LINSPACE +mp_obj_t create_linspace(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(create_linspace_obj); +#endif + +#if ULAB_NUMPY_HAS_LOGSPACE +mp_obj_t create_logspace(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(create_logspace_obj); +#endif + +#if ULAB_NUMPY_HAS_ONES +mp_obj_t create_ones(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(create_ones_obj); +#endif + +#if ULAB_NUMPY_HAS_ZEROS +mp_obj_t create_zeros(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(create_zeros_obj); +#endif + +#if ULAB_NUMPY_HAS_FROMBUFFER +mp_obj_t create_frombuffer(size_t , const mp_obj_t *, mp_map_t *); +MP_DECLARE_CONST_FUN_OBJ_KW(create_frombuffer_obj); +#endif + +#define ARANGE_LOOP(type_, ndarray, len, step, stop) \ +({\ + type_ *array = (type_ *)(ndarray)->array;\ + for (size_t i = 0; i < (len) - 1; i++, (value) += (step)) {\ + *array++ = (type_)(value);\ + }\ + *array = (type_)(stop);\ +}) + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft.c b/circuitpython/extmod/ulab/code/numpy/fft/fft.c new file mode 100644 index 0000000..27cb79c --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/fft/fft.c @@ -0,0 +1,102 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/runtime.h" +#include "py/builtin.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objarray.h" + +#include "../carray/carray_tools.h" +#include "fft.h" + +//| """Frequency-domain functions""" +//| +//| import ulab.numpy + + +//| def fft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]: +//| """ +//| :param ulab.numpy.ndarray 
r: A 1-dimension array of values whose size is a power of 2 +//| :param ulab.numpy.ndarray c: An optional 1-dimension array of values whose size is a power of 2, giving the complex part of the value +//| :return tuple (r, c): The real and complex parts of the FFT +//| +//| Perform a Fast Fourier Transform from the time domain into the frequency domain +//| +//| See also ~ulab.extras.spectrum, which computes the magnitude of the fft, +//| rather than separately returning its real and imaginary parts.""" +//| ... +//| +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +static mp_obj_t fft_fft(mp_obj_t arg) { + return fft_fft_ifft_spectrogram(arg, FFT_FFT); +} + +MP_DEFINE_CONST_FUN_OBJ_1(fft_fft_obj, fft_fft); +#else +static mp_obj_t fft_fft(size_t n_args, const mp_obj_t *args) { + if(n_args == 2) { + return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_FFT); + } else { + return fft_fft_ifft_spectrogram(n_args, args[0], mp_const_none, FFT_FFT); + } +} + +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj, 1, 2, fft_fft); +#endif + +//| def ifft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]: +//| """ +//| :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2 +//| :param ulab.numpy.ndarray c: An optional 1-dimension array of values whose size is a power of 2, giving the complex part of the value +//| :return tuple (r, c): The real and complex parts of the inverse FFT +//| +//| Perform an Inverse Fast Fourier Transform from the frequeny domain into the time domain""" +//| ... 
+//| + +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +static mp_obj_t fft_ifft(mp_obj_t arg) { + return fft_fft_ifft_spectrogram(arg, FFT_IFFT); +} + +MP_DEFINE_CONST_FUN_OBJ_1(fft_ifft_obj, fft_ifft); +#else +static mp_obj_t fft_ifft(size_t n_args, const mp_obj_t *args) { + NOT_IMPLEMENTED_FOR_COMPLEX() + if(n_args == 2) { + return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_IFFT); + } else { + return fft_fft_ifft_spectrogram(n_args, args[0], mp_const_none, FFT_IFFT); + } +} + +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj, 1, 2, fft_ifft); +#endif + +STATIC const mp_rom_map_elem_t ulab_fft_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_fft) }, + { MP_OBJ_NEW_QSTR(MP_QSTR_fft), (mp_obj_t)&fft_fft_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_ifft), (mp_obj_t)&fft_ifft_obj }, +}; + +STATIC MP_DEFINE_CONST_DICT(mp_module_ulab_fft_globals, ulab_fft_globals_table); + +const mp_obj_module_t ulab_fft_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_fft_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_fft, ulab_fft_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft.h b/circuitpython/extmod/ulab/code/numpy/fft/fft.h new file mode 100644 index 0000000..1e50a8d --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/fft/fft.h @@ -0,0 +1,30 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _FFT_ +#define _FFT_ + +#include "../../ulab.h" +#include "../../ulab_tools.h" +#include "../../ndarray.h" +#include "fft_tools.h" + +extern const mp_obj_module_t ulab_fft_module; + +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +MP_DECLARE_CONST_FUN_OBJ_3(fft_fft_obj); +MP_DECLARE_CONST_FUN_OBJ_3(fft_ifft_obj); +#else +MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj); 
+MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj); +#endif + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c new file mode 100644 index 0000000..8a55927 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c @@ -0,0 +1,287 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#include <math.h> +#include <string.h> +#include "py/runtime.h" + +#include "../../ndarray.h" +#include "../../ulab_tools.h" +#include "../carray/carray_tools.h" +#include "fft_tools.h" + +#ifndef MP_PI +#define MP_PI MICROPY_FLOAT_CONST(3.14159265358979323846) +#endif +#ifndef MP_E +#define MP_E MICROPY_FLOAT_CONST(2.71828182845904523536) +#endif + +/* Kernel implementation for the case, when ulab has no complex support + + * The following function takes two arrays, namely, the real and imaginary + * parts of a complex array, and calculates the Fourier transform in place. + * + * The function is basically a modification of four1 from Numerical Recipes, + * has no dependencies beyond micropython itself (for the definition of mp_float_t), + * and can be used independent of ulab. + */ + +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +/* Kernel implementation for the complex case. Data are contained in data as + + data[0], data[1], data[2], data[3], .... , data[2n - 2], data[2n-1] + real[0], imag[0], real[1], imag[1], .... 
, real[n-1], imag[n-1] + + In general + real[i] = data[2i] + imag[i] = data[2i+1] + +*/ +void fft_kernel_complex(mp_float_t *data, size_t n, int isign) { + size_t j, m, mmax, istep; + mp_float_t tempr, tempi; + mp_float_t wtemp, wr, wpr, wpi, wi, theta; + + j = 0; + for(size_t i = 0; i < n; i++) { + if (j > i) { + SWAP(mp_float_t, data[2*i], data[2*j]); + SWAP(mp_float_t, data[2*i+1], data[2*j+1]); + } + m = n >> 1; + while (j >= m && m > 0) { + j -= m; + m >>= 1; + } + j += m; + } + + mmax = 1; + while (n > mmax) { + istep = mmax << 1; + theta = MICROPY_FLOAT_CONST(-2.0)*isign*MP_PI/istep; + wtemp = MICROPY_FLOAT_C_FUN(sin)(MICROPY_FLOAT_CONST(0.5) * theta); + wpr = MICROPY_FLOAT_CONST(-2.0) * wtemp * wtemp; + wpi = MICROPY_FLOAT_C_FUN(sin)(theta); + wr = MICROPY_FLOAT_CONST(1.0); + wi = MICROPY_FLOAT_CONST(0.0); + for(m = 0; m < mmax; m++) { + for(size_t i = m; i < n; i += istep) { + j = i + mmax; + tempr = wr * data[2*j] - wi * data[2*j+1]; + tempi = wr * data[2*j+1] + wi * data[2*j]; + data[2*j] = data[2*i] - tempr; + data[2*j+1] = data[2*i+1] - tempi; + data[2*i] += tempr; + data[2*i+1] += tempi; + } + wtemp = wr; + wr = wr*wpr - wi*wpi + wr; + wi = wi*wpr + wtemp*wpi + wi; + } + mmax = istep; + } +} + +/* + * The following function is a helper interface to the python side. + * It has been factored out from fft.c, so that the same argument parsing + * routine can be called from scipy.signal.spectrogram. 
+ */ +mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t data_in, uint8_t type) { + if(!mp_obj_is_type(data_in, &ulab_ndarray_type)) { + mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only")); + } + ndarray_obj_t *in = MP_OBJ_TO_PTR(data_in); + #if ULAB_MAX_DIMS > 1 + if(in->ndim != 1) { + mp_raise_TypeError(translate("FFT is implemented for linear arrays only")); + } + #endif + size_t len = in->len; + // Check if input is of length of power of 2 + if((len & (len-1)) != 0) { + mp_raise_ValueError(translate("input array length must be power of 2")); + } + + ndarray_obj_t *out = ndarray_new_linear_array(len, NDARRAY_COMPLEX); + mp_float_t *data = (mp_float_t *)out->array; + uint8_t *array = (uint8_t *)in->array; + + if(in->dtype == NDARRAY_COMPLEX) { + uint8_t sz = 2 * sizeof(mp_float_t); + uint8_t *data_ = (uint8_t *)out->array; + for(size_t i = 0; i < len; i++) { + memcpy(data_, array, sz); + array += in->strides[ULAB_MAX_DIMS - 1]; + } + } else { + mp_float_t (*func)(void *) = ndarray_get_float_function(in->dtype); + for(size_t i = 0; i < len; i++) { + // real part; the imaginary part is 0, no need to assign + *data = func(array); + data += 2; + array += in->strides[ULAB_MAX_DIMS - 1]; + } + } + data -= 2 * len; + + if((type == FFT_FFT) || (type == FFT_SPECTROGRAM)) { + fft_kernel_complex(data, len, 1); + if(type == FFT_SPECTROGRAM) { + ndarray_obj_t *spectrum = ndarray_new_linear_array(len, NDARRAY_FLOAT); + mp_float_t *sarray = (mp_float_t *)spectrum->array; + for(size_t i = 0; i < len; i++) { + *sarray++ = MICROPY_FLOAT_C_FUN(sqrt)(data[0] * data[0] + data[1] * data[1]); + data += 2; + } + m_del(mp_float_t, data, 2 * len); + return MP_OBJ_FROM_PTR(spectrum); + } + } else { // inverse transform + fft_kernel_complex(data, len, -1); + // TODO: numpy accepts the norm keyword argument + for(size_t i = 0; i < len; i++) { + *data++ /= len; + } + } + return MP_OBJ_FROM_PTR(out); +} +#else /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */ +void 
fft_kernel(mp_float_t *real, mp_float_t *imag, size_t n, int isign) { + size_t j, m, mmax, istep; + mp_float_t tempr, tempi; + mp_float_t wtemp, wr, wpr, wpi, wi, theta; + + j = 0; + for(size_t i = 0; i < n; i++) { + if (j > i) { + SWAP(mp_float_t, real[i], real[j]); + SWAP(mp_float_t, imag[i], imag[j]); + } + m = n >> 1; + while (j >= m && m > 0) { + j -= m; + m >>= 1; + } + j += m; + } + + mmax = 1; + while (n > mmax) { + istep = mmax << 1; + theta = MICROPY_FLOAT_CONST(-2.0)*isign*MP_PI/istep; + wtemp = MICROPY_FLOAT_C_FUN(sin)(MICROPY_FLOAT_CONST(0.5) * theta); + wpr = MICROPY_FLOAT_CONST(-2.0) * wtemp * wtemp; + wpi = MICROPY_FLOAT_C_FUN(sin)(theta); + wr = MICROPY_FLOAT_CONST(1.0); + wi = MICROPY_FLOAT_CONST(0.0); + for(m = 0; m < mmax; m++) { + for(size_t i = m; i < n; i += istep) { + j = i + mmax; + tempr = wr * real[j] - wi * imag[j]; + tempi = wr * imag[j] + wi * real[j]; + real[j] = real[i] - tempr; + imag[j] = imag[i] - tempi; + real[i] += tempr; + imag[i] += tempi; + } + wtemp = wr; + wr = wr*wpr - wi*wpi + wr; + wi = wi*wpr + wtemp*wpi + wi; + } + mmax = istep; + } +} + +mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_im, uint8_t type) { + if(!mp_obj_is_type(arg_re, &ulab_ndarray_type)) { + mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only")); + } + if(n_args == 2) { + if(!mp_obj_is_type(arg_im, &ulab_ndarray_type)) { + mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only")); + } + } + ndarray_obj_t *re = MP_OBJ_TO_PTR(arg_re); + #if ULAB_MAX_DIMS > 1 + if(re->ndim != 1) { + COMPLEX_DTYPE_NOT_IMPLEMENTED(re->dtype) + mp_raise_TypeError(translate("FFT is implemented for linear arrays only")); + } + #endif + size_t len = re->len; + // Check if input is of length of power of 2 + if((len & (len-1)) != 0) { + mp_raise_ValueError(translate("input array length must be power of 2")); + } + + ndarray_obj_t *out_re = ndarray_new_linear_array(len, NDARRAY_FLOAT); + mp_float_t *data_re = 
(mp_float_t *)out_re->array; + + uint8_t *array = (uint8_t *)re->array; + mp_float_t (*func)(void *) = ndarray_get_float_function(re->dtype); + + for(size_t i=0; i < len; i++) { + *data_re++ = func(array); + array += re->strides[ULAB_MAX_DIMS - 1]; + } + data_re -= len; + ndarray_obj_t *out_im = ndarray_new_linear_array(len, NDARRAY_FLOAT); + mp_float_t *data_im = (mp_float_t *)out_im->array; + + if(n_args == 2) { + ndarray_obj_t *im = MP_OBJ_TO_PTR(arg_im); + #if ULAB_MAX_DIMS > 1 + if(im->ndim != 1) { + COMPLEX_DTYPE_NOT_IMPLEMENTED(im->dtype) + mp_raise_TypeError(translate("FFT is implemented for linear arrays only")); + } + #endif + if (re->len != im->len) { + mp_raise_ValueError(translate("real and imaginary parts must be of equal length")); + } + array = (uint8_t *)im->array; + func = ndarray_get_float_function(im->dtype); + for(size_t i=0; i < len; i++) { + *data_im++ = func(array); + array += im->strides[ULAB_MAX_DIMS - 1]; + } + data_im -= len; + } + + if((type == FFT_FFT) || (type == FFT_SPECTROGRAM)) { + fft_kernel(data_re, data_im, len, 1); + if(type == FFT_SPECTROGRAM) { + for(size_t i=0; i < len; i++) { + *data_re = MICROPY_FLOAT_C_FUN(sqrt)(*data_re * *data_re + *data_im * *data_im); + data_re++; + data_im++; + } + } + } else { // inverse transform + fft_kernel(data_re, data_im, len, -1); + // TODO: numpy accepts the norm keyword argument + for(size_t i=0; i < len; i++) { + *data_re++ /= len; + *data_im++ /= len; + } + } + if(type == FFT_SPECTROGRAM) { + return MP_OBJ_TO_PTR(out_re); + } else { + mp_obj_t tuple[2]; + tuple[0] = out_re; + tuple[1] = out_im; + return mp_obj_new_tuple(2, tuple); + } +} +#endif /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */ diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h new file mode 100644 index 0000000..9444232 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h @@ -0,0 +1,28 @@ +/* + * This file is part of the 
micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _FFT_TOOLS_ +#define _FFT_TOOLS_ + +enum FFT_TYPE { + FFT_FFT, + FFT_IFFT, + FFT_SPECTROGRAM, +}; + +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +void fft_kernel(mp_float_t *, size_t , int ); +mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t , uint8_t ); +#else +void fft_kernel(mp_float_t *, mp_float_t *, size_t , int ); +mp_obj_t fft_fft_ifft_spectrogram(size_t , mp_obj_t , mp_obj_t , uint8_t ); +#endif /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */ + +#endif /* _FFT_TOOLS_ */ diff --git a/circuitpython/extmod/ulab/code/numpy/filter.c b/circuitpython/extmod/ulab/code/numpy/filter.c new file mode 100644 index 0000000..057cd6d --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/filter.c @@ -0,0 +1,132 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../scipy/signal/signal.h" +#include "carray/carray_tools.h" +#include "filter.h" + +#if ULAB_NUMPY_HAS_CONVOLVE + +mp_obj_t filter_convolve(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_a, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_v, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type) || 
!mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("convolve arguments must be ndarrays")); + } + + ndarray_obj_t *a = MP_OBJ_TO_PTR(args[0].u_obj); + ndarray_obj_t *c = MP_OBJ_TO_PTR(args[1].u_obj); + // deal with linear arrays only + #if ULAB_MAX_DIMS > 1 + if((a->ndim != 1) || (c->ndim != 1)) { + mp_raise_TypeError(translate("convolve arguments must be linear arrays")); + } + #endif + size_t len_a = a->len; + size_t len_c = c->len; + if(len_a == 0 || len_c == 0) { + mp_raise_TypeError(translate("convolve arguments must not be empty")); + } + + int len = len_a + len_c - 1; // convolve mode "full" + int32_t off = len_c - 1; + uint8_t dtype = NDARRAY_FLOAT; + + #if ULAB_SUPPORTS_COMPLEX + if((a->dtype == NDARRAY_COMPLEX) || (c->dtype == NDARRAY_COMPLEX)) { + dtype = NDARRAY_COMPLEX; + } + #endif + ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype); + mp_float_t *array = (mp_float_t *)ndarray->array; + + uint8_t *aarray = (uint8_t *)a->array; + uint8_t *carray = (uint8_t *)c->array; + + int32_t as = a->strides[ULAB_MAX_DIMS - 1] / a->itemsize; + int32_t cs = c->strides[ULAB_MAX_DIMS - 1] / c->itemsize; + + + #if ULAB_SUPPORTS_COMPLEX + if(dtype == NDARRAY_COMPLEX) { + mp_float_t a_real, a_imag; + mp_float_t c_real, c_imag = MICROPY_FLOAT_CONST(0.0); + for(int32_t k = -off; k < len-off; k++) { + mp_float_t accum_real = MICROPY_FLOAT_CONST(0.0); + mp_float_t accum_imag = MICROPY_FLOAT_CONST(0.0); + + int32_t top_n = MIN(len_c, len_a - k); + int32_t bot_n = MAX(-k, 0); + + for(int32_t n = bot_n; n < top_n; n++) { + int32_t idx_c = (len_c - n - 1) * cs; + int32_t idx_a = (n + k) * as; + if(a->dtype != NDARRAY_COMPLEX) { + a_real = ndarray_get_float_index(aarray, a->dtype, idx_a); + a_imag = MICROPY_FLOAT_CONST(0.0); + } else { + a_real = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a); + a_imag = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a + 1); + } + + if(c->dtype != NDARRAY_COMPLEX) { + c_real = 
ndarray_get_float_index(carray, c->dtype, idx_c); + c_imag = MICROPY_FLOAT_CONST(0.0); + } else { + c_real = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c); + c_imag = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c + 1); + } + accum_real += a_real * c_real - a_imag * c_imag; + accum_imag += a_real * c_imag + a_imag * c_real; + } + *array++ = accum_real; + *array++ = accum_imag; + } + return MP_OBJ_FROM_PTR(ndarray); + } + #endif + + for(int32_t k = -off; k < len-off; k++) { + mp_float_t accum = MICROPY_FLOAT_CONST(0.0); + int32_t top_n = MIN(len_c, len_a - k); + int32_t bot_n = MAX(-k, 0); + for(int32_t n = bot_n; n < top_n; n++) { + int32_t idx_c = (len_c - n - 1) * cs; + int32_t idx_a = (n + k) * as; + mp_float_t ai = ndarray_get_float_index(aarray, a->dtype, idx_a); + mp_float_t ci = ndarray_get_float_index(carray, c->dtype, idx_c); + accum += ai * ci; + } + *array++ = accum; + } + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(filter_convolve_obj, 2, filter_convolve); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/filter.h b/circuitpython/extmod/ulab/code/numpy/filter.h new file mode 100644 index 0000000..d6d0f17 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/filter.h @@ -0,0 +1,20 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020-2021 Zoltán Vörös +*/ + +#ifndef _FILTER_ +#define _FILTER_ + +#include "../ulab.h" +#include "../ndarray.h" + +MP_DECLARE_CONST_FUN_OBJ_KW(filter_convolve_obj); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c new file mode 100644 index 0000000..11dc7de --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c @@ -0,0 +1,541 @@ + +/* + * This file is part of the micropython-ulab project, + * + * 
https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Roberto Colistete Jr. + * 2020 Taku Fukada + * +*/ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../../ulab.h" +#include "../../ulab_tools.h" +#include "../carray/carray_tools.h" +#include "linalg.h" + +#if ULAB_NUMPY_HAS_LINALG_MODULE +//| +//| import ulab.numpy +//| +//| """Linear algebra functions""" +//| + +#if ULAB_MAX_DIMS > 1 +//| def cholesky(A: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| :param ~ulab.numpy.ndarray A: a positive definite, symmetric square matrix +//| :return ~ulab.numpy.ndarray L: a square root matrix in the lower triangular form +//| :raises ValueError: If the input does not fulfill the necessary conditions +//| +//| The returned matrix satisfies the equation m=LL*""" +//| ... +//| + +static mp_obj_t linalg_cholesky(mp_obj_t oin) { + ndarray_obj_t *ndarray = tools_object_is_square(oin); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + ndarray_obj_t *L = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, ndarray->shape[ULAB_MAX_DIMS - 1], ndarray->shape[ULAB_MAX_DIMS - 1]), NDARRAY_FLOAT); + mp_float_t *Larray = (mp_float_t *)L->array; + + size_t N = ndarray->shape[ULAB_MAX_DIMS - 1]; + uint8_t *array = (uint8_t *)ndarray->array; + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + + for(size_t m=0; m < N; m++) { // rows + for(size_t n=0; n < N; n++) { // columns + *Larray++ = func(array); + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + } + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + } + Larray -= N*N; + // make sure the matrix is symmetric + for(size_t m=0; m < N; m++) { // rows + for(size_t n=m+1; n < N; n++) { // columns + // compare entry (m, n) to (n, m) + 
if(LINALG_EPSILON < MICROPY_FLOAT_C_FUN(fabs)(Larray[m * N + n] - Larray[n * N + m])) { + mp_raise_ValueError(translate("input matrix is asymmetric")); + } + } + } + + // this is actually not needed, but Cholesky in numpy returns the lower triangular matrix + for(size_t i=0; i < N; i++) { // rows + for(size_t j=i+1; j < N; j++) { // columns + Larray[i*N + j] = MICROPY_FLOAT_CONST(0.0); + } + } + mp_float_t sum = 0.0; + for(size_t i=0; i < N; i++) { // rows + for(size_t j=0; j <= i; j++) { // columns + sum = Larray[i * N + j]; + for(size_t k=0; k < j; k++) { + sum -= Larray[i * N + k] * Larray[j * N + k]; + } + if(i == j) { + if(sum <= MICROPY_FLOAT_CONST(0.0)) { + mp_raise_ValueError(translate("matrix is not positive definite")); + } else { + Larray[i * N + i] = MICROPY_FLOAT_C_FUN(sqrt)(sum); + } + } else { + Larray[i * N + j] = sum / Larray[j * N + j]; + } + } + } + return MP_OBJ_FROM_PTR(L); +} + +MP_DEFINE_CONST_FUN_OBJ_1(linalg_cholesky_obj, linalg_cholesky); + +//| def det(m: ulab.numpy.ndarray) -> float: +//| """ +//| :param: m, a square matrix +//| :return float: The determinant of the matrix +//| +//| Computes the eigenvalues and eigenvectors of a square matrix""" +//| ... 
+//| + +static mp_obj_t linalg_det(mp_obj_t oin) { + ndarray_obj_t *ndarray = tools_object_is_square(oin); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + uint8_t *array = (uint8_t *)ndarray->array; + size_t N = ndarray->shape[ULAB_MAX_DIMS - 1]; + mp_float_t *tmp = m_new(mp_float_t, N * N); + for(size_t m=0; m < N; m++) { // rows + for(size_t n=0; n < N; n++) { // columns + *tmp++ = ndarray_get_float_value(array, ndarray->dtype); + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + } + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + } + + // re-wind the pointer + tmp -= N*N; + + mp_float_t c; + mp_float_t det_sign = 1.0; + + for(size_t m=0; m < N-1; m++){ + if(MICROPY_FLOAT_C_FUN(fabs)(tmp[m * (N+1)]) < LINALG_EPSILON) { + size_t m1 = m + 1; + for(; m1 < N; m1++) { + if(!(MICROPY_FLOAT_C_FUN(fabs)(tmp[m1*N+m]) < LINALG_EPSILON)) { + //look for a line to swap + for(size_t m2=0; m2 < N; m2++) { + mp_float_t swapVal = tmp[m*N+m2]; + tmp[m*N+m2] = tmp[m1*N+m2]; + tmp[m1*N+m2] = swapVal; + } + det_sign = -det_sign; + break; + } + } + if (m1 >= N) { + m_del(mp_float_t, tmp, N * N); + return mp_obj_new_float(0.0); + } + } + for(size_t n=0; n < N; n++) { + if(m != n) { + c = tmp[N * n + m] / tmp[m * (N+1)]; + for(size_t k=0; k < N; k++){ + tmp[N * n + k] -= c * tmp[N * m + k]; + } + } + } + } + mp_float_t det = det_sign; + + for(size_t m=0; m < N; m++){ + det *= tmp[m * (N+1)]; + } + m_del(mp_float_t, tmp, N * N); + return mp_obj_new_float(det); +} + +MP_DEFINE_CONST_FUN_OBJ_1(linalg_det_obj, linalg_det); + +#endif + +#if ULAB_MAX_DIMS > 1 +//| def eig(m: ulab.numpy.ndarray) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]: +//| """ +//| :param m: a square matrix +//| :return tuple (eigenvectors, eigenvalues): +//| +//| Computes the eigenvalues and eigenvectors of a square matrix""" +//| ... 
+//| + +static mp_obj_t linalg_eig(mp_obj_t oin) { + ndarray_obj_t *in = tools_object_is_square(oin); + COMPLEX_DTYPE_NOT_IMPLEMENTED(in->dtype) + uint8_t *iarray = (uint8_t *)in->array; + size_t S = in->shape[ULAB_MAX_DIMS - 1]; + mp_float_t *array = m_new(mp_float_t, S*S); + for(size_t i=0; i < S; i++) { // rows + for(size_t j=0; j < S; j++) { // columns + *array++ = ndarray_get_float_value(iarray, in->dtype); + iarray += in->strides[ULAB_MAX_DIMS - 1]; + } + iarray -= in->strides[ULAB_MAX_DIMS - 1] * S; + iarray += in->strides[ULAB_MAX_DIMS - 2]; + } + array -= S * S; + // make sure the matrix is symmetric + for(size_t m=0; m < S; m++) { + for(size_t n=m+1; n < S; n++) { + // compare entry (m, n) to (n, m) + // TODO: this must probably be scaled! + if(LINALG_EPSILON < MICROPY_FLOAT_C_FUN(fabs)(array[m * S + n] - array[n * S + m])) { + mp_raise_ValueError(translate("input matrix is asymmetric")); + } + } + } + + // if we got this far, then the matrix will be symmetric + + ndarray_obj_t *eigenvectors = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, S, S), NDARRAY_FLOAT); + mp_float_t *eigvectors = (mp_float_t *)eigenvectors->array; + + size_t iterations = linalg_jacobi_rotations(array, eigvectors, S); + + if(iterations == 0) { + // the computation did not converge; numpy raises LinAlgError + m_del(mp_float_t, array, in->len); + mp_raise_ValueError(translate("iterations did not converge")); + } + ndarray_obj_t *eigenvalues = ndarray_new_linear_array(S, NDARRAY_FLOAT); + mp_float_t *eigvalues = (mp_float_t *)eigenvalues->array; + for(size_t i=0; i < S; i++) { + eigvalues[i] = array[i * (S + 1)]; + } + m_del(mp_float_t, array, in->len); + + mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(2, NULL)); + tuple->items[0] = MP_OBJ_FROM_PTR(eigenvalues); + tuple->items[1] = MP_OBJ_FROM_PTR(eigenvectors); + return tuple; +} + +MP_DEFINE_CONST_FUN_OBJ_1(linalg_eig_obj, linalg_eig); + +//| def inv(m: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| 
:param ~ulab.numpy.ndarray m: a square matrix +//| :return: The inverse of the matrix, if it exists +//| :raises ValueError: if the matrix is not invertible +//| +//| Computes the inverse of a square matrix""" +//| ... +//| +static mp_obj_t linalg_inv(mp_obj_t o_in) { + ndarray_obj_t *ndarray = tools_object_is_square(o_in); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + uint8_t *array = (uint8_t *)ndarray->array; + size_t N = ndarray->shape[ULAB_MAX_DIMS - 1]; + ndarray_obj_t *inverted = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, N, N), NDARRAY_FLOAT); + mp_float_t *iarray = (mp_float_t *)inverted->array; + + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + + for(size_t i=0; i < N; i++) { // rows + for(size_t j=0; j < N; j++) { // columns + *iarray++ = func(array); + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + } + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + } + // re-wind the pointer + iarray -= N*N; + + if(!linalg_invert_matrix(iarray, N)) { + mp_raise_ValueError(translate("input matrix is singular")); + } + return MP_OBJ_FROM_PTR(inverted); +} + +MP_DEFINE_CONST_FUN_OBJ_1(linalg_inv_obj, linalg_inv); +#endif + +//| def norm(x: ulab.numpy.ndarray) -> float: +//| """ +//| :param ~ulab.numpy.ndarray x: a vector or a matrix +//| +//| Computes the 2-norm of a vector or a matrix, i.e., ``sqrt(sum(x*x))``, however, without the RAM overhead.""" +//| ... 
+//| + +static mp_obj_t linalg_norm(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none} } , + { MP_QSTR_axis, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t x = args[0].u_obj; + mp_obj_t axis = args[1].u_obj; + + mp_float_t dot = 0.0, value; + size_t count = 1; + + if(mp_obj_is_type(x, &mp_type_tuple) || mp_obj_is_type(x, &mp_type_list) || mp_obj_is_type(x, &mp_type_range)) { + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(x, &iter_buf); + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + value = mp_obj_get_float(item); + // we could simply take the sum of value ** 2, + // but this method is numerically stable + dot = dot + (value * value - dot) / count++; + } + return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(dot * (count - 1))); + } else if(mp_obj_is_type(x, &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(x); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + uint8_t *array = (uint8_t *)ndarray->array; + // always get a float, so that we don't have to resolve the dtype later + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + shape_strides _shape_strides = tools_reduce_axes(ndarray, axis); + ndarray_obj_t *results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_FLOAT); + mp_float_t *rarray = (mp_float_t *)results->array; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + if(axis != mp_const_none) { + count = 1; + dot = 0.0; + } + do { + value = func(array); + dot = dot + (value * value - dot) / count++; + array += 
_shape_strides.strides[0]; + l++; + } while(l < _shape_strides.shape[0]); + *rarray = MICROPY_FLOAT_C_FUN(sqrt)(dot * (count - 1)); + #if ULAB_MAX_DIMS > 1 + rarray += _shape_strides.increment; + array -= _shape_strides.strides[0] * _shape_strides.shape[0]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 1]; + k++; + } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 1]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 2]; + j++; + } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 3]; + i++; + } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 3]); + #endif + if(results->ndim == 0) { + return mp_obj_new_float(*rarray); + } + return results; + } + return mp_const_none; // we should never reach this point +} + +MP_DEFINE_CONST_FUN_OBJ_KW(linalg_norm_obj, 1, linalg_norm); + +#if ULAB_MAX_DIMS > 1 +//| def qr(m: ulab.numpy.ndarray) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]: +//| """ +//| :param m: a matrix +//| :return tuple (Q, R): +//| +//| Factor the matrix a as QR, where Q is orthonormal and R is upper-triangular. +//| """ +//| ... 
+//| + +static mp_obj_t linalg_qr(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_mode, MP_ARG_OBJ, { .u_rom_obj = MP_ROM_QSTR(MP_QSTR_reduced) } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("operation is defined for ndarrays only")); + } + ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj); + if(source->ndim != 2) { + mp_raise_ValueError(translate("operation is defined for 2D arrays only")); + } + + size_t m = source->shape[ULAB_MAX_DIMS - 2]; // rows + size_t n = source->shape[ULAB_MAX_DIMS - 1]; // columns + + ndarray_obj_t *Q = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, m, m), NDARRAY_FLOAT); + ndarray_obj_t *R = ndarray_new_dense_ndarray(2, source->shape, NDARRAY_FLOAT); + + mp_float_t *qarray = (mp_float_t *)Q->array; + mp_float_t *rarray = (mp_float_t *)R->array; + + // simply copy the entries of source to a float array + mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype); + uint8_t *sarray = (uint8_t *)source->array; + + for(size_t i = 0; i < m; i++) { + for(size_t j = 0; j < n; j++) { + *rarray++ = func(sarray); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + } + sarray -= n * source->strides[ULAB_MAX_DIMS - 1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + } + rarray -= m * n; + + // start with the unit matrix + for(size_t i = 0; i < m; i++) { + qarray[i * (m + 1)] = 1.0; + } + + for(size_t j = 0; j < n; j++) { // columns + for(size_t i = m - 1; i > j; i--) { // rows + mp_float_t c, s; + // Givens matrix: note that numpy uses a strange form of the rotation + // [[c s], + // [s -c]] + if(MICROPY_FLOAT_C_FUN(fabs)(rarray[i * n + j]) < LINALG_EPSILON) { // r[i, j] + c = 
(rarray[(i - 1) * n + j] >= 0.0) ? 1.0 : -1.0; // r[i-1, j] + s = 0.0; + } else if(MICROPY_FLOAT_C_FUN(fabs)(rarray[(i - 1) * n + j]) < LINALG_EPSILON) { // r[i-1, j] + c = 0.0; + s = (rarray[i * n + j] >= 0.0) ? -1.0 : 1.0; // r[i, j] + } else { + mp_float_t t, u; + if(MICROPY_FLOAT_C_FUN(fabs)(rarray[(i - 1) * n + j]) > MICROPY_FLOAT_C_FUN(fabs)(rarray[i * n + j])) { // r[i-1, j], r[i, j] + t = rarray[i * n + j] / rarray[(i - 1) * n + j]; // r[i, j]/r[i-1, j] + u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t); + c = -1.0 / u; + s = c * t; + } else { + t = rarray[(i - 1) * n + j] / rarray[i * n + j]; // r[i-1, j]/r[i, j] + u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t); + s = -1.0 / u; + c = s * t; + } + } + + mp_float_t r1, r2; + // update R: multiply with the rotation matrix from the left + for(size_t k = 0; k < n; k++) { + r1 = rarray[(i - 1) * n + k]; // r[i-1, k] + r2 = rarray[i * n + k]; // r[i, k] + rarray[(i - 1) * n + k] = c * r1 + s * r2; // r[i-1, k] + rarray[i * n + k] = s * r1 - c * r2; // r[i, k] + } + + // update Q: multiply with the transpose of the rotation matrix from the right + for(size_t k = 0; k < m; k++) { + r1 = qarray[k * m + (i - 1)]; + r2 = qarray[k * m + i]; + qarray[k * m + (i - 1)] = c * r1 + s * r2; + qarray[k * m + i] = s * r1 - c * r2; + } + } + } + + mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(2, NULL)); + GET_STR_DATA_LEN(args[1].u_obj, mode, len); + if(memcmp(mode, "complete", 8) == 0) { + tuple->items[0] = MP_OBJ_FROM_PTR(Q); + tuple->items[1] = MP_OBJ_FROM_PTR(R); + } else if(memcmp(mode, "reduced", 7) == 0) { + size_t k = MAX(m, n) - MIN(m, n); + ndarray_obj_t *q = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, m, m - k), NDARRAY_FLOAT); + ndarray_obj_t *r = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, m - k, n), NDARRAY_FLOAT); + mp_float_t *qa = (mp_float_t *)q->array; + mp_float_t *ra = (mp_float_t *)r->array; + for(size_t i = 0; i < m; i++) { + memcpy(qa, qarray, (m - k) * q->itemsize); + qa += (m - k); + 
qarray += m; + } + for(size_t i = 0; i < m - k; i++) { + memcpy(ra, rarray, n * r->itemsize); + ra += n; + rarray += n; + } + tuple->items[0] = MP_OBJ_FROM_PTR(q); + tuple->items[1] = MP_OBJ_FROM_PTR(r); + } else { + mp_raise_ValueError(translate("mode must be complete, or reduced")); + } + return tuple; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(linalg_qr_obj, 1, linalg_qr); +#endif + +STATIC const mp_rom_map_elem_t ulab_linalg_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_linalg) }, + #if ULAB_MAX_DIMS > 1 + #if ULAB_LINALG_HAS_CHOLESKY + { MP_ROM_QSTR(MP_QSTR_cholesky), (mp_obj_t)&linalg_cholesky_obj }, + #endif + #if ULAB_LINALG_HAS_DET + { MP_ROM_QSTR(MP_QSTR_det), (mp_obj_t)&linalg_det_obj }, + #endif + #if ULAB_LINALG_HAS_EIG + { MP_ROM_QSTR(MP_QSTR_eig), (mp_obj_t)&linalg_eig_obj }, + #endif + #if ULAB_LINALG_HAS_INV + { MP_ROM_QSTR(MP_QSTR_inv), (mp_obj_t)&linalg_inv_obj }, + #endif + #if ULAB_LINALG_HAS_QR + { MP_ROM_QSTR(MP_QSTR_qr), (mp_obj_t)&linalg_qr_obj }, + #endif + #endif + #if ULAB_LINALG_HAS_NORM + { MP_ROM_QSTR(MP_QSTR_norm), (mp_obj_t)&linalg_norm_obj }, + #endif +}; + +STATIC MP_DEFINE_CONST_DICT(mp_module_ulab_linalg_globals, ulab_linalg_globals_table); + +const mp_obj_module_t ulab_linalg_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_linalg_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_linalg, ulab_linalg_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h new file mode 100644 index 0000000..35fc403 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h @@ -0,0 +1,27 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _LINALG_ +#define _LINALG_ + +#include "../../ulab.h" 
+#include "../../ndarray.h" +#include "linalg_tools.h" + +extern const mp_obj_module_t ulab_linalg_module; + +MP_DECLARE_CONST_FUN_OBJ_1(linalg_cholesky_obj); +MP_DECLARE_CONST_FUN_OBJ_1(linalg_det_obj); +MP_DECLARE_CONST_FUN_OBJ_1(linalg_eig_obj); +MP_DECLARE_CONST_FUN_OBJ_1(linalg_inv_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(linalg_norm_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(linalg_qr_obj); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c new file mode 100644 index 0000000..5e03a50 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c @@ -0,0 +1,171 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2010 Zoltán Vörös +*/ + +#include <math.h> +#include <string.h> +#include "py/runtime.h" + +#include "linalg_tools.h" + +/* + * The following function inverts a matrix, whose entries are given in the input array + * The function has no dependencies beyond micropython itself (for the definition of mp_float_t), + * and can be used independent of ulab. 
+ */ + +bool linalg_invert_matrix(mp_float_t *data, size_t N) { + // returns true, of the inversion was successful, + // false, if the matrix is singular + + // initially, this is the unit matrix: the contents of this matrix is what + // will be returned after all the transformations + mp_float_t *unit = m_new(mp_float_t, N*N); + mp_float_t elem = 1.0; + // initialise the unit matrix + memset(unit, 0, sizeof(mp_float_t)*N*N); + for(size_t m=0; m < N; m++) { + memcpy(&unit[m * (N+1)], &elem, sizeof(mp_float_t)); + } + for(size_t m=0; m < N; m++){ + // this could be faster with ((c < epsilon) && (c > -epsilon)) + if(MICROPY_FLOAT_C_FUN(fabs)(data[m * (N+1)]) < LINALG_EPSILON) { + //look for a line to swap + size_t m1 = m + 1; + for(; m1 < N; m1++) { + if(!(MICROPY_FLOAT_C_FUN(fabs)(data[m1*N + m]) < LINALG_EPSILON)) { + for(size_t m2=0; m2 < N; m2++) { + mp_float_t swapVal = data[m*N+m2]; + data[m*N+m2] = data[m1*N+m2]; + data[m1*N+m2] = swapVal; + swapVal = unit[m*N+m2]; + unit[m*N+m2] = unit[m1*N+m2]; + unit[m1*N+m2] = swapVal; + } + break; + } + } + if (m1 >= N) { + m_del(mp_float_t, unit, N*N); + return false; + } + } + for(size_t n=0; n < N; n++) { + if(m != n){ + elem = data[N * n + m] / data[m * (N+1)]; + for(size_t k=0; k < N; k++) { + data[N * n + k] -= elem * data[N * m + k]; + unit[N * n + k] -= elem * unit[N * m + k]; + } + } + } + } + for(size_t m=0; m < N; m++) { + elem = data[m * (N+1)]; + for(size_t n=0; n < N; n++) { + data[N * m + n] /= elem; + unit[N * m + n] /= elem; + } + } + memcpy(data, unit, sizeof(mp_float_t)*N*N); + m_del(mp_float_t, unit, N * N); + return true; +} + +/* + * The following function calculates the eigenvalues and eigenvectors of a symmetric + * real matrix, whose entries are given in the input array. + * The function has no dependencies beyond micropython itself (for the definition of mp_float_t), + * and can be used independent of ulab. 
+ */ + +size_t linalg_jacobi_rotations(mp_float_t *array, mp_float_t *eigvectors, size_t S) { + // eigvectors should be a 0-array; start out with the unit matrix + for(size_t m=0; m < S; m++) { + eigvectors[m * (S+1)] = 1.0; + } + mp_float_t largest, w, t, c, s, tau, aMk, aNk, vm, vn; + size_t M, N; + size_t iterations = JACOBI_MAX * S * S; + do { + iterations--; + // find the pivot here + M = 0; + N = 0; + largest = 0.0; + for(size_t m=0; m < S-1; m++) { // -1: no need to inspect last row + for(size_t n=m+1; n < S; n++) { + w = MICROPY_FLOAT_C_FUN(fabs)(array[m * S + n]); + if((largest < w) && (LINALG_EPSILON < w)) { + M = m; + N = n; + largest = w; + } + } + } + if(M + N == 0) { // all entries are smaller than epsilon, there is not much we can do... + break; + } + // at this point, we have the pivot, and it is the entry (M, N) + // now we have to find the rotation angle + w = (array[N * S + N] - array[M * S + M]) / (MICROPY_FLOAT_CONST(2.0)*array[M * S + N]); + // The following if/else chooses the smaller absolute value for the tangent + // of the rotation angle. Going with the smaller should be numerically stabler. 
+ if(w > 0) { + t = MICROPY_FLOAT_C_FUN(sqrt)(w*w + MICROPY_FLOAT_CONST(1.0)) - w; + } else { + t = MICROPY_FLOAT_CONST(-1.0)*(MICROPY_FLOAT_C_FUN(sqrt)(w*w + MICROPY_FLOAT_CONST(1.0)) + w); + } + s = t / MICROPY_FLOAT_C_FUN(sqrt)(t*t + MICROPY_FLOAT_CONST(1.0)); // the sine of the rotation angle + c = MICROPY_FLOAT_CONST(1.0) / MICROPY_FLOAT_C_FUN(sqrt)(t*t + MICROPY_FLOAT_CONST(1.0)); // the cosine of the rotation angle + tau = (MICROPY_FLOAT_CONST(1.0)-c)/s; // this is equal to the tangent of the half of the rotation angle + + // at this point, we have the rotation angles, so we can transform the matrix + // first the two diagonal elements + // a(M, M) = a(M, M) - t*a(M, N) + array[M * S + M] = array[M * S + M] - t * array[M * S + N]; + // a(N, N) = a(N, N) + t*a(M, N) + array[N * S + N] = array[N * S + N] + t * array[M * S + N]; + // after the rotation, the a(M, N), and a(N, M) entries should become zero + array[M * S + N] = array[N * S + M] = MICROPY_FLOAT_CONST(0.0); + // then all other elements in the column + for(size_t k=0; k < S; k++) { + if((k == M) || (k == N)) { + continue; + } + aMk = array[M * S + k]; + aNk = array[N * S + k]; + // a(M, k) = a(M, k) - s*(a(N, k) + tau*a(M, k)) + array[M * S + k] -= s * (aNk + tau * aMk); + // a(N, k) = a(N, k) + s*(a(M, k) - tau*a(N, k)) + array[N * S + k] += s * (aMk - tau * aNk); + // a(k, M) = a(M, k) + array[k * S + M] = array[M * S + k]; + // a(k, N) = a(N, k) + array[k * S + N] = array[N * S + k]; + } + // now we have to update the eigenvectors + // the rotation matrix, R, multiplies from the right + // R is the unit matrix, except for the + // R(M,M) = R(N, N) = c + // R(N, M) = s + // (M, N) = -s + // entries. 
This means that only the Mth, and Nth columns will change + for(size_t m=0; m < S; m++) { + vm = eigvectors[m * S + M]; + vn = eigvectors[m * S + N]; + // the new value of eigvectors(m, M) + eigvectors[m * S + M] = c * vm - s * vn; + // the new value of eigvectors(m, N) + eigvectors[m * S + N] = s * vm + c * vn; + } + } while(iterations > 0); + + return iterations; +} diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h new file mode 100644 index 0000000..942da00 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h @@ -0,0 +1,28 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _TOOLS_TOOLS_ +#define _TOOLS_TOOLS_ + +#ifndef LINALG_EPSILON +#if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT +#define LINALG_EPSILON MICROPY_FLOAT_CONST(1.2e-7) +#elif MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_DOUBLE +#define LINALG_EPSILON MICROPY_FLOAT_CONST(2.3e-16) +#endif +#endif /* LINALG_EPSILON */ + +#define JACOBI_MAX 20 + +bool linalg_invert_matrix(mp_float_t *, size_t ); +size_t linalg_jacobi_rotations(mp_float_t *, mp_float_t *, size_t ); + +#endif /* _TOOLS_TOOLS_ */ + diff --git a/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c new file mode 100644 index 0000000..8704836 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c @@ -0,0 +1,66 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2021 Zoltán Vörös + * +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" + +#include "ndarray_iter.h" + +#ifdef NDARRAY_HAS_FLATITER +mp_obj_t 
ndarray_flatiter_make_new(mp_obj_t self_in) { + ndarray_flatiter_t *flatiter = m_new_obj(ndarray_flatiter_t); + flatiter->base.type = &ndarray_flatiter_type; + flatiter->iternext = ndarray_flatiter_next; + flatiter->ndarray = MP_OBJ_TO_PTR(self_in); + flatiter->cur = 0; + return flatiter; +} + +mp_obj_t ndarray_flatiter_next(mp_obj_t self_in) { + ndarray_flatiter_t *self = MP_OBJ_TO_PTR(self_in); + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(self->ndarray); + uint8_t *array = (uint8_t *)ndarray->array; + + if(self->cur < ndarray->len) { + uint32_t remainder = self->cur; + uint8_t i = ULAB_MAX_DIMS - 1; + do { + size_t div = (remainder / ndarray->shape[i]); + array += remainder * ndarray->strides[i]; + remainder -= div * ndarray->shape[i]; + i--; + } while(i > ULAB_MAX_DIMS - ndarray->ndim); + self->cur++; + return ndarray_get_item(ndarray, array); + } + return MP_OBJ_STOP_ITERATION; +} + +mp_obj_t ndarray_new_flatiterator(mp_obj_t flatiter_in, mp_obj_iter_buf_t *iter_buf) { + assert(sizeof(ndarray_flatiter_t) <= sizeof(mp_obj_iter_buf_t)); + ndarray_flatiter_t *iter = (ndarray_flatiter_t *)iter_buf; + ndarray_flatiter_t *flatiter = MP_OBJ_TO_PTR(flatiter_in); + iter->base.type = &mp_type_polymorph_iter; + iter->iternext = ndarray_flatiter_next; + iter->ndarray = flatiter->ndarray; + iter->cur = 0; + return MP_OBJ_FROM_PTR(iter); +} + +mp_obj_t ndarray_get_flatiterator(mp_obj_t o_in, mp_obj_iter_buf_t *iter_buf) { + return ndarray_new_flatiterator(o_in, iter_buf); +} +#endif /* NDARRAY_HAS_FLATITER */ diff --git a/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h new file mode 100644 index 0000000..b3fc48d --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h @@ -0,0 +1,36 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries 
+ * 2020-2021 Zoltán Vörös +*/ + +#ifndef _NDARRAY_ITER_ +#define _NDARRAY_ITER_ + +#include "py/runtime.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objarray.h" + +#include "../../ulab.h" +#include "../../ndarray.h" + +// TODO: take simply mp_obj_ndarray_it_t from ndarray.c +typedef struct _mp_obj_ndarray_flatiter_t { + mp_obj_base_t base; + mp_fun_1_t iternext; + mp_obj_t ndarray; + size_t cur; +} ndarray_flatiter_t; + +mp_obj_t ndarray_get_flatiterator(mp_obj_t , mp_obj_iter_buf_t *); +mp_obj_t ndarray_flatiter_make_new(mp_obj_t ); +mp_obj_t ndarray_flatiter_next(mp_obj_t ); + +#endif
\ No newline at end of file diff --git a/circuitpython/extmod/ulab/code/numpy/numerical.c b/circuitpython/extmod/ulab/code/numpy/numerical.c new file mode 100644 index 0000000..d6983c0 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/numerical.c @@ -0,0 +1,1402 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/objint.h" +#include "py/runtime.h" +#include "py/builtin.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "./carray/carray_tools.h" +#include "numerical.h" + +enum NUMERICAL_FUNCTION_TYPE { + NUMERICAL_ALL, + NUMERICAL_ANY, + NUMERICAL_ARGMAX, + NUMERICAL_ARGMIN, + NUMERICAL_MAX, + NUMERICAL_MEAN, + NUMERICAL_MIN, + NUMERICAL_STD, + NUMERICAL_SUM, +}; + +//| """Numerical and Statistical functions +//| +//| Most of these functions take an "axis" argument, which indicates whether to +//| operate over the flattened array (None), or a particular axis (integer).""" +//| +//| from typing import Dict +//| +//| _ArrayLike = Union[ndarray, List[_float], Tuple[_float], range] +//| +//| _DType = int +//| """`ulab.numpy.int8`, `ulab.numpy.uint8`, `ulab.numpy.int16`, `ulab.numpy.uint16`, `ulab.numpy.float` or `ulab.numpy.bool`""" +//| +//| from builtins import float as _float +//| from builtins import bool as _bool +//| +//| int8: _DType +//| """Type code for signed integers in the range -128 .. 127 inclusive, like the 'b' typecode of `array.array`""" +//| +//| int16: _DType +//| """Type code for signed integers in the range -32768 .. 
32767 inclusive, like the 'h' typecode of `array.array`""" +//| +//| float: _DType +//| """Type code for floating point values, like the 'f' typecode of `array.array`""" +//| +//| uint8: _DType +//| """Type code for unsigned integers in the range 0 .. 255 inclusive, like the 'H' typecode of `array.array`""" +//| +//| uint16: _DType +//| """Type code for unsigned integers in the range 0 .. 65535 inclusive, like the 'h' typecode of `array.array`""" +//| +//| bool: _DType +//| """Type code for boolean values""" +//| + +static void numerical_reduce_axes(ndarray_obj_t *ndarray, int8_t axis, size_t *shape, int32_t *strides) { + // removes the values corresponding to a single axis from the shape and strides array + uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + axis; + if((ndarray->ndim == 1) && (axis == 0)) { + index = 0; + shape[ULAB_MAX_DIMS - 1] = 1; + return; + } + for(uint8_t i = ULAB_MAX_DIMS - 1; i > 0; i--) { + if(i > index) { + shape[i] = ndarray->shape[i]; + strides[i] = ndarray->strides[i]; + } else { + shape[i] = ndarray->shape[i-1]; + strides[i] = ndarray->strides[i-1]; + } + } +} + +#if ULAB_NUMPY_HAS_ALL | ULAB_NUMPY_HAS_ANY +static mp_obj_t numerical_all_any(mp_obj_t oin, mp_obj_t axis, uint8_t optype) { + bool anytype = optype == NUMERICAL_ALL ? 
1 : 0; + if(mp_obj_is_type(oin, &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin); + uint8_t *array = (uint8_t *)ndarray->array; + if(ndarray->len == 0) { // return immediately with empty arrays + if(optype == NUMERICAL_ALL) { + return mp_const_true; + } else { + return mp_const_false; + } + } + // always get a float, so that we don't have to resolve the dtype later + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + ndarray_obj_t *results = NULL; + uint8_t *rarray = NULL; + shape_strides _shape_strides = tools_reduce_axes(ndarray, axis); + if(axis != mp_const_none) { + results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_BOOL); + rarray = results->array; + if(optype == NUMERICAL_ALL) { + memset(rarray, 1, results->len); + } + } + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + if(axis == mp_const_none) { + do { + #if ULAB_SUPPORTS_COMPLEX + if(ndarray->dtype == NDARRAY_COMPLEX) { + mp_float_t real = *((mp_float_t *)array); + mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t))); + if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) { + // optype = NUMERICAL_ANY + return mp_const_true; + } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) { + // optype == NUMERICAL_ALL + return mp_const_false; + } + } else { + #endif + mp_float_t value = func(array); + if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) { + // optype = NUMERICAL_ANY + return mp_const_true; + } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) { + // optype == NUMERICAL_ALL + return mp_const_false; + } + #if ULAB_SUPPORTS_COMPLEX + } + #endif + array += _shape_strides.strides[0]; + l++; + } while(l < _shape_strides.shape[0]); + } else { // a scalar axis keyword was supplied + do { + #if 
ULAB_SUPPORTS_COMPLEX + if(ndarray->dtype == NDARRAY_COMPLEX) { + mp_float_t real = *((mp_float_t *)array); + mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t))); + if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) { + // optype = NUMERICAL_ANY + *rarray = 1; + // since we are breaking out of the loop, move the pointer forward + array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l); + break; + } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) { + // optype == NUMERICAL_ALL + *rarray = 0; + // since we are breaking out of the loop, move the pointer forward + array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l); + break; + } + } else { + #endif + mp_float_t value = func(array); + if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) { + // optype == NUMERICAL_ANY + *rarray = 1; + // since we are breaking out of the loop, move the pointer forward + array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l); + break; + } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) { + // optype == NUMERICAL_ALL + *rarray = 0; + // since we are breaking out of the loop, move the pointer forward + array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l); + break; + } + #if ULAB_SUPPORTS_COMPLEX + } + #endif + array += _shape_strides.strides[0]; + l++; + } while(l < _shape_strides.shape[0]); + } + #if ULAB_MAX_DIMS > 1 + rarray += _shape_strides.increment; + array -= _shape_strides.strides[0] * _shape_strides.shape[0]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 1]; + k++; + } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 1]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 2]; + j++; + } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= 
_shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 3]; + i++; + } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 3]); + #endif + if(axis == mp_const_none) { + // the innermost loop fell through, so return the result here + if(!anytype) { + return mp_const_false; + } else { + return mp_const_true; + } + } + return results; + } else if(mp_obj_is_int(oin) || mp_obj_is_float(oin)) { + return mp_obj_is_true(oin) ? mp_const_true : mp_const_false; + } else { + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(oin, &iter_buf); + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + if(!mp_obj_is_true(item) & !anytype) { + return mp_const_false; + } else if(mp_obj_is_true(item) & anytype) { + return mp_const_true; + } + } + } + return anytype ? mp_const_true : mp_const_false; +} +#endif + +#if ULAB_NUMPY_HAS_SUM | ULAB_NUMPY_HAS_MEAN | ULAB_NUMPY_HAS_STD +static mp_obj_t numerical_sum_mean_std_iterable(mp_obj_t oin, uint8_t optype, size_t ddof) { + mp_float_t value = MICROPY_FLOAT_CONST(0.0); + mp_float_t M = MICROPY_FLOAT_CONST(0.0); + mp_float_t m = MICROPY_FLOAT_CONST(0.0); + mp_float_t S = MICROPY_FLOAT_CONST(0.0); + mp_float_t s = MICROPY_FLOAT_CONST(0.0); + size_t count = 0; + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(oin, &iter_buf); + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + value = mp_obj_get_float(item); + m = M + (value - M) / (count + 1); + s = S + (value - M) * (value - m); + M = m; + S = s; + count++; + } + if(optype == NUMERICAL_SUM) { + return mp_obj_new_float(m * count); + } else if(optype == NUMERICAL_MEAN) { + return count > 0 ? mp_obj_new_float(m) : mp_obj_new_float(MICROPY_FLOAT_CONST(0.0)); + } else { // this should be the case of the standard deviation + return count > ddof ? 
mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(s / (count - ddof))) : mp_obj_new_float(MICROPY_FLOAT_CONST(0.0)); + } +} + +static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t axis, uint8_t optype, size_t ddof) { + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + uint8_t *array = (uint8_t *)ndarray->array; + shape_strides _shape_strides = tools_reduce_axes(ndarray, axis); + + if(axis == mp_const_none) { + // work with the flattened array + if((optype == NUMERICAL_STD) && (ddof > ndarray->len)) { + // if there are too many degrees of freedom, there is no point in calculating anything + return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0)); + } + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + mp_float_t M = MICROPY_FLOAT_CONST(0.0); + mp_float_t m = MICROPY_FLOAT_CONST(0.0); + mp_float_t S = MICROPY_FLOAT_CONST(0.0); + mp_float_t s = MICROPY_FLOAT_CONST(0.0); + size_t count = 0; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + count++; + mp_float_t value = func(array); + m = M + (value - M) / (mp_float_t)count; + if(optype == NUMERICAL_STD) { + s = S + (value - M) * (value - m); + S = s; + } + M = m; + array += _shape_strides.strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < _shape_strides.shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= _shape_strides.strides[ULAB_MAX_DIMS - 3] * 
_shape_strides.shape[ULAB_MAX_DIMS - 3]; + array += _shape_strides.strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 4]); + #endif + if(optype == NUMERICAL_SUM) { + // numpy returns an integer for integer input types + if(ndarray->dtype == NDARRAY_FLOAT) { + return mp_obj_new_float(M * ndarray->len); + } else { + return mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(round)(M * ndarray->len)); + } + } else if(optype == NUMERICAL_MEAN) { + return mp_obj_new_float(M); + } else { // this must be the case of the standard deviation + // we have already made certain that ddof < ndarray->len holds + return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(S / (ndarray->len - ddof))); + } + } else { + ndarray_obj_t *results = NULL; + uint8_t *rarray = NULL; + mp_float_t *farray = NULL; + if(optype == NUMERICAL_SUM) { + results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype); + rarray = (uint8_t *)results->array; + // TODO: numpy promotes the output to the highest integer type + if(ndarray->dtype == NDARRAY_UINT8) { + RUN_SUM(uint8_t, array, results, rarray, _shape_strides); + } else if(ndarray->dtype == NDARRAY_INT8) { + RUN_SUM(int8_t, array, results, rarray, _shape_strides); + } else if(ndarray->dtype == NDARRAY_UINT16) { + RUN_SUM(uint16_t, array, results, rarray, _shape_strides); + } else if(ndarray->dtype == NDARRAY_INT16) { + RUN_SUM(int16_t, array, results, rarray, _shape_strides); + } else { + // for floats, the sum might be inaccurate with the naive summation + // call mean, and multiply with the number of samples + farray = (mp_float_t *)results->array; + RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, MICROPY_FLOAT_CONST(0.0), 0); + mp_float_t norm = (mp_float_t)_shape_strides.shape[0]; + // re-wind the array here + farray = (mp_float_t *)results->array; + for(size_t i=0; i < results->len; i++) { + *farray++ *= norm; + } + } + } else { + bool isStd = optype == NUMERICAL_STD ? 
1 : 0; + results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_FLOAT); + farray = (mp_float_t *)results->array; + // we can return the 0 array here, if the degrees of freedom is larger than the length of the axis + if((optype == NUMERICAL_STD) && (_shape_strides.shape[0] <= ddof)) { + return MP_OBJ_FROM_PTR(results); + } + mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : MICROPY_FLOAT_CONST(0.0); + if(ndarray->dtype == NDARRAY_UINT8) { + RUN_MEAN_STD(uint8_t, array, farray, _shape_strides, div, isStd); + } else if(ndarray->dtype == NDARRAY_INT8) { + RUN_MEAN_STD(int8_t, array, farray, _shape_strides, div, isStd); + } else if(ndarray->dtype == NDARRAY_UINT16) { + RUN_MEAN_STD(uint16_t, array, farray, _shape_strides, div, isStd); + } else if(ndarray->dtype == NDARRAY_INT16) { + RUN_MEAN_STD(int16_t, array, farray, _shape_strides, div, isStd); + } else { + RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, div, isStd); + } + } + if(results->ndim == 0) { // return a scalar here + return mp_binary_get_val_array(results->dtype, results->array, 0); + } + return MP_OBJ_FROM_PTR(results); + } + return mp_const_none; +} +#endif + +#if ULAB_NUMPY_HAS_ARGMINMAX +static mp_obj_t numerical_argmin_argmax_iterable(mp_obj_t oin, uint8_t optype) { + if(MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(oin)) == 0) { + mp_raise_ValueError(translate("attempt to get argmin/argmax of an empty sequence")); + } + size_t idx = 0, best_idx = 0; + mp_obj_iter_buf_t iter_buf; + mp_obj_t iterable = mp_getiter(oin, &iter_buf); + mp_obj_t item; + uint8_t op = 0; // argmin, min + if((optype == NUMERICAL_ARGMAX) || (optype == NUMERICAL_MAX)) op = 1; + item = mp_iternext(iterable); + mp_obj_t best_obj = item; + mp_float_t value, best_value = mp_obj_get_float(item); + value = best_value; + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + idx++; + value = mp_obj_get_float(item); + if((op == 0) && (value < 
best_value)) { + best_obj = item; + best_idx = idx; + best_value = value; + } else if((op == 1) && (value > best_value)) { + best_obj = item; + best_idx = idx; + best_value = value; + } + } + if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) { + return MP_OBJ_NEW_SMALL_INT(best_idx); + } else { + return best_obj; + } +} + +static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t axis, uint8_t optype) { + // TODO: treat the flattened array + if(ndarray->len == 0) { + mp_raise_ValueError(translate("attempt to get (arg)min/(arg)max of empty sequence")); + } + + if(axis == mp_const_none) { + // work with the flattened array + mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype); + uint8_t *array = (uint8_t *)ndarray->array; + mp_float_t best_value = func(array); + mp_float_t value; + size_t index = 0, best_index = 0; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + value = func(array); + if((optype == NUMERICAL_ARGMAX) || (optype == NUMERICAL_MAX)) { + if(best_value < value) { + best_value = value; + best_index = index; + } + } else { + if(best_value > value) { + best_value = value; + best_index = index; + } + } + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + l++; + index++; + } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1]; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2]; + array += ndarray->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < ndarray->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3]; + array 
+= ndarray->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < ndarray->shape[ULAB_MAX_DIMS - 4]); + #endif + + if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) { + return mp_obj_new_int(best_index); + } else { + if(ndarray->dtype == NDARRAY_FLOAT) { + return mp_obj_new_float(best_value); + } else { + return MP_OBJ_NEW_SMALL_INT((int32_t)best_value); + } + } + } else { + int8_t ax = tools_get_axis(axis, ndarray->ndim); + + uint8_t *array = (uint8_t *)ndarray->array; + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + ax; + + ndarray_obj_t *results = NULL; + + if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) { + results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, NDARRAY_INT16); + } else { + results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, ndarray->dtype); + } + + uint8_t *rarray = (uint8_t *)results->array; + + if(ndarray->dtype == NDARRAY_UINT8) { + RUN_ARGMIN(ndarray, uint8_t, array, results, rarray, shape, strides, index, optype); + } else if(ndarray->dtype == NDARRAY_INT8) { + RUN_ARGMIN(ndarray, int8_t, array, results, rarray, shape, strides, index, optype); + } else if(ndarray->dtype == NDARRAY_UINT16) { + RUN_ARGMIN(ndarray, uint16_t, array, results, rarray, shape, strides, index, optype); + } else if(ndarray->dtype == NDARRAY_INT16) { + RUN_ARGMIN(ndarray, int16_t, array, results, rarray, shape, strides, index, optype); + } else { + RUN_ARGMIN(ndarray, mp_float_t, array, results, rarray, shape, strides, index, optype); + } + if(results->len == 1) { + return mp_binary_get_val_array(results->dtype, results->array, 0); + } + return MP_OBJ_FROM_PTR(results); + } + return mp_const_none; +} +#endif + +static mp_obj_t numerical_function(size_t 
n_args, const mp_obj_t *pos_args, mp_map_t *kw_args, uint8_t optype) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none} } , + { MP_QSTR_axis, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t oin = args[0].u_obj; + mp_obj_t axis = args[1].u_obj; + if((axis != mp_const_none) && (!mp_obj_is_int(axis))) { + mp_raise_TypeError(translate("axis must be None, or an integer")); + } + + if((optype == NUMERICAL_ALL) || (optype == NUMERICAL_ANY)) { + return numerical_all_any(oin, axis, optype); + } + if(mp_obj_is_type(oin, &mp_type_tuple) || mp_obj_is_type(oin, &mp_type_list) || + mp_obj_is_type(oin, &mp_type_range)) { + switch(optype) { + case NUMERICAL_MIN: + case NUMERICAL_ARGMIN: + case NUMERICAL_MAX: + case NUMERICAL_ARGMAX: + return numerical_argmin_argmax_iterable(oin, optype); + case NUMERICAL_SUM: + case NUMERICAL_MEAN: + return numerical_sum_mean_std_iterable(oin, optype, 0); + default: // we should never reach this point, but whatever + return mp_const_none; + } + } else if(mp_obj_is_type(oin, &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin); + switch(optype) { + case NUMERICAL_MIN: + case NUMERICAL_MAX: + case NUMERICAL_ARGMIN: + case NUMERICAL_ARGMAX: + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + return numerical_argmin_argmax_ndarray(ndarray, axis, optype); + case NUMERICAL_SUM: + case NUMERICAL_MEAN: + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + return numerical_sum_mean_std_ndarray(ndarray, axis, optype, 0); + default: + mp_raise_NotImplementedError(translate("operation is not implemented on ndarrays")); + } + } else { + mp_raise_TypeError(translate("input must be tuple, list, range, or ndarray")); + } + return mp_const_none; +} + +#if ULAB_NUMPY_HAS_SORT | NDARRAY_HAS_SORT +static mp_obj_t 
numerical_sort_helper(mp_obj_t oin, mp_obj_t axis, uint8_t inplace) { + if(!mp_obj_is_type(oin, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("sort argument must be an ndarray")); + } + + ndarray_obj_t *ndarray; + if(inplace == 1) { + ndarray = MP_OBJ_TO_PTR(oin); + } else { + ndarray = ndarray_copy_view(MP_OBJ_TO_PTR(oin)); + } + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + + int8_t ax = 0; + if(axis == mp_const_none) { + // flatten the array + #if ULAB_MAX_DIMS > 1 + for(uint8_t i=0; i < ULAB_MAX_DIMS - 1; i++) { + ndarray->shape[i] = 0; + ndarray->strides[i] = 0; + } + ndarray->shape[ULAB_MAX_DIMS - 1] = ndarray->len; + ndarray->strides[ULAB_MAX_DIMS - 1] = ndarray->itemsize; + ndarray->ndim = 1; + #endif + } else { + ax = tools_get_axis(axis, ndarray->ndim); + } + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + // we work with the typed array, so re-scale the stride + int32_t increment = ndarray->strides[ax] / ndarray->itemsize; + + uint8_t *array = (uint8_t *)ndarray->array; + if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) { + HEAPSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax]); + } else if((ndarray->dtype == NDARRAY_INT16) || (ndarray->dtype == NDARRAY_INT16)) { + HEAPSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax]); + } else { + HEAPSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax]); + } + if(inplace == 1) { + return mp_const_none; + } else { + return MP_OBJ_FROM_PTR(ndarray); + } +} +#endif /* ULAB_NUMERICAL_HAS_SORT | NDARRAY_HAS_SORT */ + +#if ULAB_NUMPY_HAS_ALL +mp_obj_t numerical_all(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return 
numerical_function(n_args, pos_args, kw_args, NUMERICAL_ALL); +} +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_all_obj, 1, numerical_all); +#endif + +#if ULAB_NUMPY_HAS_ANY +mp_obj_t numerical_any(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ANY); +} +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_any_obj, 1, numerical_any); +#endif + +#if ULAB_NUMPY_HAS_ARGMINMAX +//| def argmax(array: _ArrayLike, *, axis: Optional[int] = None) -> int: +//| """Return the index of the maximum element of the 1D array""" +//| ... +//| + +mp_obj_t numerical_argmax(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ARGMAX); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argmax_obj, 1, numerical_argmax); + +//| def argmin(array: _ArrayLike, *, axis: Optional[int] = None) -> int: +//| """Return the index of the minimum element of the 1D array""" +//| ... +//| + +static mp_obj_t numerical_argmin(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ARGMIN); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argmin_obj, 1, numerical_argmin); +#endif + +#if ULAB_NUMPY_HAS_ARGSORT +//| def argsort(array: ulab.numpy.ndarray, *, axis: int = -1) -> ulab.numpy.ndarray: +//| """Returns an array which gives indices into the input array from least to greatest.""" +//| ... 
+//| + +mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("argsort argument must be an ndarray")); + } + + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + if(args[1].u_obj == mp_const_none) { + // bail out, though dense arrays could still be sorted + mp_raise_NotImplementedError(translate("argsort is not implemented for flattened arrays")); + } + // Since we are returning an NDARRAY_UINT16 array, bail out, + // if the axis is longer than what we can hold + for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) { + if(ndarray->shape[i] > 65535) { + mp_raise_ValueError(translate("axis too long")); + } + } + int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim); + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + + // We could return an NDARRAY_UINT8 array, if all lengths are shorter than 256 + ndarray_obj_t *indices = ndarray_new_ndarray(ndarray->ndim, ndarray->shape, NULL, NDARRAY_UINT16); + int32_t *istrides = m_new(int32_t, ULAB_MAX_DIMS); + memset(istrides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(indices, ax, shape, istrides); + for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) { + istrides[i] /= sizeof(uint16_t); + } + + ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + // we work with the typed array, so re-scale the stride + int32_t 
increment = ndarray->strides[ax] / ndarray->itemsize; + uint16_t iincrement = indices->strides[ax] / sizeof(uint16_t); + + uint8_t *array = (uint8_t *)ndarray->array; + uint16_t *iarray = (uint16_t *)indices->array; + + // fill in the index values + #if ULAB_MAX_DIMS > 3 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t k = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t l = 0; + do { + #endif + uint16_t m = 0; + do { + *iarray = m++; + iarray += iincrement; + } while(m < indices->shape[ax]); + #if ULAB_MAX_DIMS > 1 + iarray -= iincrement * indices->shape[ax]; + iarray += istrides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < shape[ULAB_MAX_DIMS - 1]); + iarray -= istrides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1]; + iarray += istrides[ULAB_MAX_DIMS - 2]; + #endif + #if ULAB_MAX_DIMS > 2 + k++; + } while(k < shape[ULAB_MAX_DIMS - 2]); + iarray -= istrides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2]; + iarray += istrides[ULAB_MAX_DIMS - 3]; + #endif + #if ULAB_MAX_DIMS > 3 + j++; + } while(j < shape[ULAB_MAX_DIMS - 3]); + #endif + // reset the array + iarray = indices->array; + + if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) { + HEAP_ARGSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement); + } else if((ndarray->dtype == NDARRAY_UINT16) || (ndarray->dtype == NDARRAY_INT16)) { + HEAP_ARGSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement); + } else { + HEAP_ARGSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement); + } + return MP_OBJ_FROM_PTR(indices); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argsort_obj, 1, numerical_argsort); +#endif + +#if ULAB_NUMPY_HAS_CROSS +//| def cross(a: ulab.numpy.ndarray, b: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """Return the cross product of two vectors of length 3""" +//| ... 
+//| + +static mp_obj_t numerical_cross(mp_obj_t _a, mp_obj_t _b) { + if (!mp_obj_is_type(_a, &ulab_ndarray_type) || !mp_obj_is_type(_b, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("arguments must be ndarrays")); + } + ndarray_obj_t *a = MP_OBJ_TO_PTR(_a); + ndarray_obj_t *b = MP_OBJ_TO_PTR(_b); + COMPLEX_DTYPE_NOT_IMPLEMENTED(a->dtype) + COMPLEX_DTYPE_NOT_IMPLEMENTED(b->dtype) + if((a->ndim != 1) || (b->ndim != 1) || (a->len != b->len) || (a->len != 3)) { + mp_raise_ValueError(translate("cross is defined for 1D arrays of length 3")); + } + + mp_float_t *results = m_new(mp_float_t, 3); + results[0] = ndarray_get_float_index(a->array, a->dtype, 1) * ndarray_get_float_index(b->array, b->dtype, 2); + results[0] -= ndarray_get_float_index(a->array, a->dtype, 2) * ndarray_get_float_index(b->array, b->dtype, 1); + results[1] = -ndarray_get_float_index(a->array, a->dtype, 0) * ndarray_get_float_index(b->array, b->dtype, 2); + results[1] += ndarray_get_float_index(a->array, a->dtype, 2) * ndarray_get_float_index(b->array, b->dtype, 0); + results[2] = ndarray_get_float_index(a->array, a->dtype, 0) * ndarray_get_float_index(b->array, b->dtype, 1); + results[2] -= ndarray_get_float_index(a->array, a->dtype, 1) * ndarray_get_float_index(b->array, b->dtype, 0); + + /* The upcasting happens here with the rules + + - if one of the operarands is a float, the result is always float + - operation on identical types preserves type + + uint8 + int8 => int16 + uint8 + int16 => int16 + uint8 + uint16 => uint16 + int8 + int16 => int16 + int8 + uint16 => uint16 + uint16 + int16 => float + + */ + + uint8_t dtype = NDARRAY_FLOAT; + if(a->dtype == b->dtype) { + dtype = a->dtype; + } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT8)) || ((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT8))) { + dtype = NDARRAY_INT16; + } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_UINT8))) { + 
dtype = NDARRAY_INT16; + } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_UINT8))) { + dtype = NDARRAY_UINT16; + } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_INT8))) { + dtype = NDARRAY_INT16; + } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_INT8))) { + dtype = NDARRAY_UINT16; + } + + ndarray_obj_t *ndarray = ndarray_new_linear_array(3, dtype); + if(dtype == NDARRAY_UINT8) { + uint8_t *array = (uint8_t *)ndarray->array; + for(uint8_t i=0; i < 3; i++) array[i] = (uint8_t)results[i]; + } else if(dtype == NDARRAY_INT8) { + int8_t *array = (int8_t *)ndarray->array; + for(uint8_t i=0; i < 3; i++) array[i] = (int8_t)results[i]; + } else if(dtype == NDARRAY_UINT16) { + uint16_t *array = (uint16_t *)ndarray->array; + for(uint8_t i=0; i < 3; i++) array[i] = (uint16_t)results[i]; + } else if(dtype == NDARRAY_INT16) { + int16_t *array = (int16_t *)ndarray->array; + for(uint8_t i=0; i < 3; i++) array[i] = (int16_t)results[i]; + } else { + mp_float_t *array = (mp_float_t *)ndarray->array; + for(uint8_t i=0; i < 3; i++) array[i] = results[i]; + } + m_del(mp_float_t, results, 3); + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_2(numerical_cross_obj, numerical_cross); + +#endif /* ULAB_NUMERICAL_HAS_CROSS */ + +#if ULAB_NUMPY_HAS_DIFF +//| def diff(array: ulab.numpy.ndarray, *, n: int = 1, axis: int = -1) -> ulab.numpy.ndarray: +//| """Return the numerical derivative of successive elements of the array, as +//| an array. axis=None is not supported.""" +//| ... 
+//| + +mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_n, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 1 } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = -1 } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("diff argument must be an ndarray")); + } + + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + int8_t ax = args[2].u_int; + if(ax < 0) ax += ndarray->ndim; + + if((ax < 0) || (ax > ndarray->ndim - 1)) { + mp_raise_ValueError(translate("index out of range")); + } + + if((args[1].u_int < 0) || (args[1].u_int > 9)) { + mp_raise_ValueError(translate("differentiation order out of range")); + } + uint8_t N = (uint8_t)args[1].u_int; + uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + ax; + if(N > ndarray->shape[index]) { + mp_raise_ValueError(translate("differentiation order out of range")); + } + + int8_t *stencil = m_new(int8_t, N+1); + stencil[0] = 1; + for(uint8_t i=1; i < N+1; i++) { + stencil[i] = -stencil[i-1]*(N-i+1)/i; + } + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) { + shape[i] = ndarray->shape[i]; + if(i == index) { + shape[i] -= N; + } + } + uint8_t *array = (uint8_t *)ndarray->array; + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, shape, ndarray->dtype); + uint8_t *rarray = (uint8_t *)results->array; + + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + + 
if(ndarray->dtype == NDARRAY_UINT8) { + RUN_DIFF(ndarray, uint8_t, array, results, rarray, shape, strides, index, stencil, N); + } else if(ndarray->dtype == NDARRAY_INT8) { + RUN_DIFF(ndarray, int8_t, array, results, rarray, shape, strides, index, stencil, N); + } else if(ndarray->dtype == NDARRAY_UINT16) { + RUN_DIFF(ndarray, uint16_t, array, results, rarray, shape, strides, index, stencil, N); + } else if(ndarray->dtype == NDARRAY_INT16) { + RUN_DIFF(ndarray, int16_t, array, results, rarray, shape, strides, index, stencil, N); + } else { + RUN_DIFF(ndarray, mp_float_t, array, results, rarray, shape, strides, index, stencil, N); + } + m_del(int8_t, stencil, N+1); + m_del(size_t, shape, ULAB_MAX_DIMS); + m_del(int32_t, strides, ULAB_MAX_DIMS); + return MP_OBJ_FROM_PTR(results); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_diff_obj, 1, numerical_diff); +#endif + +#if ULAB_NUMPY_HAS_FLIP +//| def flip(array: ulab.numpy.ndarray, *, axis: Optional[int] = None) -> ulab.numpy.ndarray: +//| """Returns a new array that reverses the order of the elements along the +//| given axis, or along all axes if axis is None.""" +//| ... 
+//| + +mp_obj_t numerical_flip(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("flip argument must be an ndarray")); + } + + ndarray_obj_t *results = NULL; + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj); + if(args[1].u_obj == mp_const_none) { // flip the flattened array + results = ndarray_new_linear_array(ndarray->len, ndarray->dtype); + ndarray_copy_array(ndarray, results, 0); + uint8_t *rarray = (uint8_t *)results->array; + rarray += (results->len - 1) * results->itemsize; + results->array = rarray; + results->strides[ULAB_MAX_DIMS - 1] = -results->strides[ULAB_MAX_DIMS - 1]; + } else if(mp_obj_is_int(args[1].u_obj)){ + int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim); + + ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + int32_t offset = (ndarray->shape[ax] - 1) * ndarray->strides[ax]; + results = ndarray_new_view(ndarray, ndarray->ndim, ndarray->shape, ndarray->strides, offset); + results->strides[ax] = -results->strides[ax]; + } else { + mp_raise_TypeError(translate("wrong axis index")); + } + return results; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_flip_obj, 1, numerical_flip); +#endif + +#if ULAB_NUMPY_HAS_MINMAX +//| def max(array: _ArrayLike, *, axis: Optional[int] = None) -> _float: +//| """Return the maximum element of the 1D array""" +//| ... 
+//| + +mp_obj_t numerical_max(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MAX); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_max_obj, 1, numerical_max); +#endif + +#if ULAB_NUMPY_HAS_MEAN +//| def mean(array: _ArrayLike, *, axis: Optional[int] = None) -> _float: +//| """Return the mean element of the 1D array, as a number if axis is None, otherwise as an array.""" +//| ... +//| + +mp_obj_t numerical_mean(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MEAN); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_mean_obj, 1, numerical_mean); +#endif + +#if ULAB_NUMPY_HAS_MEDIAN +//| def median(array: ulab.numpy.ndarray, *, axis: int = -1) -> ulab.numpy.ndarray: +//| """Find the median value in an array along the given axis, or along all axes if axis is None.""" +//| ... +//| + +mp_obj_t numerical_median(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("median argument must be an ndarray")); + } + + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj); + if(ndarray->len == 0) { + return mp_obj_new_float(MICROPY_FLOAT_C_FUN(nan)("")); + } + + ndarray = numerical_sort_helper(args[0].u_obj, args[1].u_obj, 0); + + if((args[1].u_obj == mp_const_none) || (ndarray->ndim == 1)) { + // at this point, the array holding the sorted values should be flat + uint8_t *array = (uint8_t *)ndarray->array; + size_t len = ndarray->len; + array += (len >> 1) * ndarray->itemsize; + mp_float_t 
median = ndarray_get_float_value(array, ndarray->dtype); + if(!(len & 0x01)) { // len is an even number + array -= ndarray->itemsize; + median += ndarray_get_float_value(array, ndarray->dtype); + median *= MICROPY_FLOAT_CONST(0.5); + } + return mp_obj_new_float(median); + } else { + int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim); + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim-1, shape, NDARRAY_FLOAT); + mp_float_t *rarray = (mp_float_t *)results->array; + + uint8_t *array = (uint8_t *)ndarray->array; + + size_t len = ndarray->shape[ax]; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + size_t k = 0; + do { + array += ndarray->strides[ax] * (len >> 1); + mp_float_t median = ndarray_get_float_value(array, ndarray->dtype); + if(!(len & 0x01)) { // len is an even number + array -= ndarray->strides[ax]; + median += ndarray_get_float_value(array, ndarray->dtype); + median *= MICROPY_FLOAT_CONST(0.5); + array += ndarray->strides[ax]; + } + array -= ndarray->strides[ax] * (len >> 1); + array += strides[ULAB_MAX_DIMS - 1]; + *rarray = median; + rarray++; + k++; + } while(k < shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 2 + array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1]; + array += strides[ULAB_MAX_DIMS - 2]; + j++; + } while(j < shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2]; + array += strides[ULAB_MAX_DIMS - 3]; + i++; + } while(i < shape[ULAB_MAX_DIMS - 3]); + #endif + + return MP_OBJ_FROM_PTR(results); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_median_obj, 
1, numerical_median); +#endif + +#if ULAB_NUMPY_HAS_MINMAX +//| def min(array: _ArrayLike, *, axis: Optional[int] = None) -> _float: +//| """Return the minimum element of the 1D array""" +//| ... +//| + +mp_obj_t numerical_min(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MIN); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_min_obj, 1, numerical_min); +#endif + +#if ULAB_NUMPY_HAS_ROLL +//| def roll(array: ulab.numpy.ndarray, distance: int, *, axis: Optional[int] = None) -> None: +//| """Shift the content of a vector by the positions given as the second +//| argument. If the ``axis`` keyword is supplied, the shift is applied to +//| the given axis. The array is modified in place.""" +//| ... +//| + +mp_obj_t numerical_roll(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("roll argument must be an ndarray")); + } + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj); + uint8_t *array = ndarray->array; + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype); + + int32_t shift = mp_obj_get_int(args[1].u_obj); + int32_t _shift = shift < 0 ? 
-shift : shift; + + size_t counter; + uint8_t *rarray = (uint8_t *)results->array; + + if(args[2].u_obj == mp_const_none) { // roll the flattened array + _shift = _shift % results->len; + if(shift > 0) { // shift to the right + rarray += _shift * results->itemsize; + counter = results->len - _shift; + } else { // shift to the left + rarray += (results->len - _shift) * results->itemsize; + counter = _shift; + } + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + memcpy(rarray, array, ndarray->itemsize); + rarray += results->itemsize; + array += ndarray->strides[ULAB_MAX_DIMS - 1]; + l++; + if(--counter == 0) { + rarray = results->array; + } + } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1]; + array += ndarray->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2]; + array += ndarray->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < ndarray->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3]; + array += ndarray->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < ndarray->shape[ULAB_MAX_DIMS - 4]); + #endif + } else if(mp_obj_is_int(args[2].u_obj)){ + int8_t ax = tools_get_axis(args[2].u_obj, ndarray->ndim); + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(ndarray, ax, shape, strides); + + size_t *rshape = m_new(size_t, ULAB_MAX_DIMS); + memset(rshape, 0, sizeof(size_t)*ULAB_MAX_DIMS); + int32_t *rstrides = m_new(int32_t, 
ULAB_MAX_DIMS); + memset(rstrides, 0, sizeof(int32_t)*ULAB_MAX_DIMS); + numerical_reduce_axes(results, ax, rshape, rstrides); + + ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + uint8_t *_rarray; + _shift = _shift % results->shape[ax]; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + _rarray = rarray; + if(shift < 0) { + rarray += (results->shape[ax] - _shift) * results->strides[ax]; + counter = _shift; + } else { + rarray += _shift * results->strides[ax]; + counter = results->shape[ax] - _shift; + } + do { + memcpy(rarray, array, ndarray->itemsize); + array += ndarray->strides[ax]; + rarray += results->strides[ax]; + if(--counter == 0) { + rarray = _rarray; + } + l++; + } while(l < ndarray->shape[ax]); + #if ULAB_MAX_DIMS > 1 + rarray = _rarray; + rarray += rstrides[ULAB_MAX_DIMS - 1]; + array -= ndarray->strides[ax] * ndarray->shape[ax]; + array += strides[ULAB_MAX_DIMS - 1]; + k++; + } while(k < shape[ULAB_MAX_DIMS - 1]); + #endif + #if ULAB_MAX_DIMS > 2 + rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS-1]; + rarray += rstrides[ULAB_MAX_DIMS - 2]; + array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS-1]; + array += strides[ULAB_MAX_DIMS - 2]; + j++; + } while(j < shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 3 + rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS-2]; + rarray += rstrides[ULAB_MAX_DIMS - 3]; + array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2]; + array += strides[ULAB_MAX_DIMS - 3]; + i++; + } while(i < shape[ULAB_MAX_DIMS - 3]); + #endif + } else { + mp_raise_TypeError(translate("wrong axis index")); + } + return results; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_roll_obj, 2, numerical_roll); +#endif + +#if ULAB_NUMPY_HAS_SORT +//| def sort(array: ulab.numpy.ndarray, *, axis: int = -1) -> ulab.numpy.ndarray: +//| """Sort the array along the given axis, or along 
all axes if axis is None. +//| The array is modified in place.""" +//| ... +//| + +mp_obj_t numerical_sort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + return numerical_sort_helper(args[0].u_obj, args[1].u_obj, 0); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sort_obj, 1, numerical_sort); +#endif + +#if NDARRAY_HAS_SORT +// method of an ndarray +static mp_obj_t numerical_sort_inplace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_int = -1 } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + return numerical_sort_helper(args[0].u_obj, args[1].u_obj, 1); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sort_inplace_obj, 1, numerical_sort_inplace); +#endif /* NDARRAY_HAS_SORT */ + +#if ULAB_NUMPY_HAS_STD +//| def std(array: _ArrayLike, *, axis: Optional[int] = None, ddof: int = 0) -> _float: +//| """Return the standard deviation of the array, as a number if axis is None, otherwise as an array.""" +//| ... 
+//| + +mp_obj_t numerical_std(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } } , + { MP_QSTR_axis, MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_ddof, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 0} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t oin = args[0].u_obj; + mp_obj_t axis = args[1].u_obj; + size_t ddof = args[2].u_int; + if((axis != mp_const_none) && (mp_obj_get_int(axis) != 0) && (mp_obj_get_int(axis) != 1)) { + // this seems to pass with False, and True... + mp_raise_ValueError(translate("axis must be None, or an integer")); + } + if(mp_obj_is_type(oin, &mp_type_tuple) || mp_obj_is_type(oin, &mp_type_list) || mp_obj_is_type(oin, &mp_type_range)) { + return numerical_sum_mean_std_iterable(oin, NUMERICAL_STD, ddof); + } else if(mp_obj_is_type(oin, &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin); + return numerical_sum_mean_std_ndarray(ndarray, axis, NUMERICAL_STD, ddof); + } else { + mp_raise_TypeError(translate("input must be tuple, list, range, or ndarray")); + } + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_std_obj, 1, numerical_std); +#endif + +#if ULAB_NUMPY_HAS_SUM +//| def sum(array: _ArrayLike, *, axis: Optional[int] = None) -> Union[_float, int, ulab.numpy.ndarray]: +//| """Return the sum of the array, as a number if axis is None, otherwise as an array.""" +//| ... 
+//| + +mp_obj_t numerical_sum(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return numerical_function(n_args, pos_args, kw_args, NUMERICAL_SUM); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sum_obj, 1, numerical_sum); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/numerical.h b/circuitpython/extmod/ulab/code/numpy/numerical.h new file mode 100644 index 0000000..8d2971c --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/numerical.h @@ -0,0 +1,652 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _NUMERICAL_ +#define _NUMERICAL_ + +#include "../ulab.h" +#include "../ndarray.h" + +// TODO: implement cumsum + +#define RUN_ARGMIN1(ndarray, type, array, results, rarray, index, op)\ +({\ + uint16_t best_index = 0;\ + type best_value = *((type *)(array));\ + if(((op) == NUMERICAL_MAX) || ((op) == NUMERICAL_ARGMAX)) {\ + for(uint16_t i=0; i < (ndarray)->shape[(index)]; i++) {\ + if(*((type *)(array)) > best_value) {\ + best_index = i;\ + best_value = *((type *)(array));\ + }\ + (array) += (ndarray)->strides[(index)];\ + }\ + } else {\ + for(uint16_t i=0; i < (ndarray)->shape[(index)]; i++) {\ + if(*((type *)(array)) < best_value) {\ + best_index = i;\ + best_value = *((type *)(array));\ + }\ + (array) += (ndarray)->strides[(index)];\ + }\ + }\ + if(((op) == NUMERICAL_ARGMAX) || ((op) == NUMERICAL_ARGMIN)) {\ + memcpy((rarray), &best_index, (results)->itemsize);\ + } else {\ + memcpy((rarray), &best_value, (results)->itemsize);\ + }\ + (rarray) += (results)->itemsize;\ +}) + +#define RUN_SUM1(type, array, results, rarray, ss)\ +({\ + type sum = 0;\ + for(size_t i=0; i < (ss).shape[0]; i++) {\ + sum += *((type *)(array));\ + (array) += (ss).strides[0];\ + }\ + memcpy((rarray), &sum, (results)->itemsize);\ + (rarray) += (results)->itemsize;\ +}) + +// The mean could be calculated by simply 
dividing the sum by +// the number of elements, but that method is numerically unstable +#define RUN_MEAN1(type, array, rarray, ss)\ +({\ + mp_float_t M = 0.0;\ + for(size_t i=0; i < (ss).shape[0]; i++) {\ + mp_float_t value = (mp_float_t)(*(type *)(array));\ + M = M + (value - M) / (mp_float_t)(i+1);\ + (array) += (ss).strides[0];\ + }\ + *(rarray)++ = M;\ +}) + +// Instead of the straightforward implementation of the definition, +// we take the numerically stable Welford algorithm here +// https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/ +#define RUN_STD1(type, array, rarray, ss, div)\ +({\ + mp_float_t M = 0.0, m = 0.0, S = 0.0;\ + for(size_t i=0; i < (ss).shape[0]; i++) {\ + mp_float_t value = (mp_float_t)(*(type *)(array));\ + m = M + (value - M) / (mp_float_t)(i+1);\ + S = S + (value - M) * (value - m);\ + M = m;\ + (array) += (ss).strides[0];\ + }\ + *(rarray)++ = MICROPY_FLOAT_C_FUN(sqrt)(S / (div));\ +}) + +#define RUN_MEAN_STD1(type, array, rarray, ss, div, isStd)\ +({\ + mp_float_t M = 0.0, m = 0.0, S = 0.0;\ + for(size_t i=0; i < (ss).shape[0]; i++) {\ + mp_float_t value = (mp_float_t)(*(type *)(array));\ + m = M + (value - M) / (mp_float_t)(i+1);\ + if(isStd) {\ + S += (value - M) * (value - m);\ + }\ + M = m;\ + (array) += (ss).strides[0];\ + }\ + *(rarray)++ = isStd ? 
MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\ +}) + +#define RUN_DIFF1(ndarray, type, array, results, rarray, index, stencil, N)\ +({\ + for(size_t i=0; i < (results)->shape[ULAB_MAX_DIMS - 1]; i++) {\ + type sum = 0;\ + uint8_t *source = (array);\ + for(uint8_t d=0; d < (N)+1; d++) {\ + sum -= (stencil)[d] * *((type *)source);\ + source += (ndarray)->strides[(index)];\ + }\ + (array) += (ndarray)->strides[ULAB_MAX_DIMS - 1];\ + *(type *)(rarray) = sum;\ + (rarray) += (results)->itemsize;\ + }\ +}) + +#define HEAPSORT1(type, array, increment, N)\ +({\ + type *_array = (type *)array;\ + type tmp;\ + size_t c, q = (N), p, r = (N) >> 1;\ + for (;;) {\ + if (r > 0) {\ + tmp = _array[(--r)*(increment)];\ + } else {\ + q--;\ + if(q == 0) {\ + break;\ + }\ + tmp = _array[q*(increment)];\ + _array[q*(increment)] = _array[0];\ + }\ + p = r;\ + c = r + r + 1;\ + while (c < q) {\ + if((c + 1 < q) && (_array[(c+1)*(increment)] > _array[c*(increment)])) {\ + c++;\ + }\ + if(_array[c*(increment)] > tmp) {\ + _array[p*(increment)] = _array[c*(increment)];\ + p = c;\ + c = p + p + 1;\ + } else {\ + break;\ + }\ + }\ + _array[p*(increment)] = tmp;\ + }\ +}) + +#define HEAP_ARGSORT1(type, array, increment, N, iarray, iincrement)\ +({\ + type *_array = (type *)array;\ + type tmp;\ + uint16_t itmp, c, q = (N), p, r = (N) >> 1;\ + for (;;) {\ + if (r > 0) {\ + r--;\ + itmp = (iarray)[r*(iincrement)];\ + tmp = _array[itmp*(increment)];\ + } else {\ + q--;\ + if(q == 0) {\ + break;\ + }\ + itmp = (iarray)[q*(iincrement)];\ + tmp = _array[itmp*(increment)];\ + (iarray)[q*(iincrement)] = (iarray)[0];\ + }\ + p = r;\ + c = r + r + 1;\ + while (c < q) {\ + if((c + 1 < q) && (_array[(iarray)[(c+1)*(iincrement)]*(increment)] > _array[(iarray)[c*(iincrement)]*(increment)])) {\ + c++;\ + }\ + if(_array[(iarray)[c*(iincrement)]*(increment)] > tmp) {\ + (iarray)[p*(iincrement)] = (iarray)[c*(iincrement)];\ + p = c;\ + c = p + p + 1;\ + } else {\ + break;\ + }\ + }\ + (iarray)[p*(iincrement)] = itmp;\ 
+ }\ +}) + +#if ULAB_MAX_DIMS == 1 +#define RUN_SUM(type, array, results, rarray, ss) do {\ + RUN_SUM1(type, (array), (results), (rarray), (ss));\ +} while(0) + +#define RUN_MEAN(type, array, rarray, ss) do {\ + RUN_MEAN1(type, (array), (rarray), (ss));\ +} while(0) + +#define RUN_STD(type, array, rarray, ss, div) do {\ + RUN_STD1(type, (array), (results), (rarray), (ss), (div));\ +} while(0) + +#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\ + RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\ +} while(0) + +#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\ + RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\ +} while(0) + +#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\ + RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\ +} while(0) + +#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do {\ + HEAPSORT1(type, (array), (increment), (N));\ +} while(0) + +#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do {\ + HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\ +} while(0) + +#endif + +#if ULAB_MAX_DIMS == 2 +#define RUN_SUM(type, array, results, rarray, ss) do {\ + size_t l = 0;\ + do {\ + RUN_SUM1(type, (array), (results), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#define RUN_MEAN(type, array, rarray, ss) do {\ + size_t l = 0;\ + do {\ + RUN_MEAN1(type, (array), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#define RUN_STD(type, array, rarray, ss, div) do {\ + size_t l = 0;\ + do {\ + RUN_STD1(type, (array), 
(rarray), (ss), (div));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\ + size_t l = 0;\ + do {\ + RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ +} while(0) + + +#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\ + size_t l = 0;\ + do {\ + RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\ + (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)];\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\ + size_t l = 0;\ + do {\ + RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\ + (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + l++;\ + } while(l < (results)->shape[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do {\ + size_t l = 0;\ + do {\ + HEAPSORT1(type, (array), (increment), (N));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do {\ + size_t l = 0;\ + do {\ + HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + (iarray) += 
(istrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ +} while(0) + +#endif + +#if ULAB_MAX_DIMS == 3 +#define RUN_SUM(type, array, results, rarray, ss) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_SUM1(type, (array), (results), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define RUN_MEAN(type, array, rarray, ss) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_MEAN1(type, (array), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define RUN_STD(type, array, rarray, ss, div) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_STD1(type, (array), (rarray), (ss), (div));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * 
(ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\ + (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)];\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\ + (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 2]);\ + (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\ + (array) += (ndarray)->strides[ULAB_MAX_DIMS - 3];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + HEAPSORT1(type, (array), (increment), (N));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + 
(array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + (iarray) += (istrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (iarray) -= (istrides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (iarray) += (istrides)[ULAB_MAX_DIMS - 2];\ + (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ +} while(0) + +#endif + +#if ULAB_MAX_DIMS == 4 +#define RUN_SUM(type, array, results, rarray, ss) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_SUM1(type, (array), (results), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define RUN_MEAN(type, array, rarray, ss) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_MEAN1(type, (array), (rarray), (ss));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } 
while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define RUN_STD(type, array, rarray, ss, div) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_STD1(type, (array), (rarray), (ss), (div));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\ + (array) -= (ss).strides[0] * (ss).shape[0];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\ + (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2];\ + (array) += (ss).strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\ + (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)];\ + (array) 
+= (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2];\ + (array) += (strides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (shape)[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\ + (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 2]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2];\ + (array) += (strides)[ULAB_MAX_DIMS - 3];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 3];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 3]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 3] * (shape)[ULAB_MAX_DIMS-3];\ + (array) += (strides)[ULAB_MAX_DIMS - 4];\ + (rarray) -= (results)->strides[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\ + (rarray) += (results)->strides[ULAB_MAX_DIMS - 4];\ + j++;\ + } while(j < (shape)[ULAB_MAX_DIMS - 4]);\ +} while(0) + +#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + HEAPSORT1(type, (array), (increment), (N));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (array) -= 
(strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ + (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2];\ + (array) += (strides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (shape)[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\ + (array) += (strides)[ULAB_MAX_DIMS - 1];\ + (iarray) += (istrides)[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (shape)[ULAB_MAX_DIMS - 1]);\ + (iarray) -= (istrides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (iarray) += (istrides)[ULAB_MAX_DIMS - 2];\ + (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1];\ + (array) += (strides)[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (shape)[ULAB_MAX_DIMS - 2]);\ + (iarray) -= (istrides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2];\ + (iarray) += (istrides)[ULAB_MAX_DIMS - 3];\ + (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2];\ + (array) += (strides)[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (shape)[ULAB_MAX_DIMS - 3]);\ +} while(0) + +#endif + +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_all_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_any_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argmax_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argmin_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argsort_obj); +MP_DECLARE_CONST_FUN_OBJ_2(numerical_cross_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_diff_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_flip_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_max_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_mean_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_median_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_min_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_roll_obj); 
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_std_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sum_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sort_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sort_inplace_obj); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/numpy.c b/circuitpython/extmod/ulab/code/numpy/numpy.c new file mode 100644 index 0000000..ebd171d --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/numpy.c @@ -0,0 +1,383 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2022 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <string.h> +#include "py/runtime.h" + +#include "numpy.h" +#include "approx.h" +#include "carray/carray.h" +#include "compare.h" +#include "create.h" +#include "fft/fft.h" +#include "filter.h" +#include "linalg/linalg.h" +#include "numerical.h" +#include "stats.h" +#include "transform.h" +#include "poly.h" +#include "vector.h" + +//| """Compatibility layer for numpy""" +//| + +//| class ndarray: ... + +//| def get_printoptions() -> Dict[str, int]: +//| """Get printing options""" +//| ... +//| +//| def set_printoptions(threshold: Optional[int] = None, edgeitems: Optional[int] = None) -> None: +//| """Set printing options""" +//| ... +//| +//| def ndinfo(array: ulab.numpy.ndarray) -> None: +//| ... +//| +//| def array( +//| values: Union[ndarray, Iterable[Union[_float, _bool, Iterable[Any]]]], +//| *, +//| dtype: _DType = ulab.numpy.float +//| ) -> ulab.numpy.ndarray: +//| """alternate constructor function for `ulab.numpy.ndarray`. Mirrors numpy.array""" +//| ... 
+ +// math constants +#if ULAB_NUMPY_HAS_E +#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C +#define ulab_const_float_e MP_ROM_PTR((mp_obj_t)(((0x402df854 & ~3) | 2) + 0x80800000)) +#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D +#define ulab_const_float_e {((mp_obj_t)((uint64_t)0x4005bf0a8b145769 + 0x8004000000000000))} +#else +mp_obj_float_t ulab_const_float_e_obj = {{&mp_type_float}, MP_E}; +#define ulab_const_float_e MP_ROM_PTR(&ulab_const_float_e_obj) +#endif +#endif + +#if ULAB_NUMPY_HAS_INF +#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C +#define numpy_const_float_inf MP_ROM_PTR((mp_obj_t)(0x7f800002 + 0x80800000)) +#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D +#define numpy_const_float_inf {((mp_obj_t)((uint64_t)0x7ff0000000000000 + 0x8004000000000000))} +#else +mp_obj_float_t numpy_const_float_inf_obj = {{&mp_type_float}, (mp_float_t)INFINITY}; +#define numpy_const_float_inf MP_ROM_PTR(&numpy_const_float_inf_obj) +#endif +#endif + +#if ULAB_NUMPY_HAS_NAN +#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C +#define numpy_const_float_nan MP_ROM_PTR((mp_obj_t)(0x7fc00002 + 0x80800000)) +#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D +#define numpy_const_float_nan {((mp_obj_t)((uint64_t)0x7ff8000000000000 + 0x8004000000000000))} +#else +mp_obj_float_t numpy_const_float_nan_obj = {{&mp_type_float}, (mp_float_t)NAN}; +#define numpy_const_float_nan MP_ROM_PTR(&numpy_const_float_nan_obj) +#endif +#endif + +#if ULAB_NUMPY_HAS_PI +#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C +#define ulab_const_float_pi MP_ROM_PTR((mp_obj_t)(((0x40490fdb & ~3) | 2) + 0x80800000)) +#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D +#define ulab_const_float_pi {((mp_obj_t)((uint64_t)0x400921fb54442d18 + 0x8004000000000000))} +#else +mp_obj_float_t ulab_const_float_pi_obj = {{&mp_type_float}, MP_PI}; +#define ulab_const_float_pi MP_ROM_PTR(&ulab_const_float_pi_obj) +#endif +#endif + +static const mp_rom_map_elem_t ulab_numpy_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_numpy) 
}, + { MP_OBJ_NEW_QSTR(MP_QSTR_ndarray), (mp_obj_t)&ulab_ndarray_type }, + { MP_OBJ_NEW_QSTR(MP_QSTR_array), MP_ROM_PTR(&ndarray_array_constructor_obj) }, + #if ULAB_NUMPY_HAS_FROMBUFFER + { MP_ROM_QSTR(MP_QSTR_frombuffer), MP_ROM_PTR(&create_frombuffer_obj) }, + #endif + // math constants + #if ULAB_NUMPY_HAS_E + { MP_ROM_QSTR(MP_QSTR_e), ulab_const_float_e }, + #endif + #if ULAB_NUMPY_HAS_INF + { MP_ROM_QSTR(MP_QSTR_inf), numpy_const_float_inf }, + #endif + #if ULAB_NUMPY_HAS_NAN + { MP_ROM_QSTR(MP_QSTR_nan), numpy_const_float_nan }, + #endif + #if ULAB_NUMPY_HAS_PI + { MP_ROM_QSTR(MP_QSTR_pi), ulab_const_float_pi }, + #endif + // class constants, always included + { MP_ROM_QSTR(MP_QSTR_bool), MP_ROM_INT(NDARRAY_BOOL) }, + { MP_ROM_QSTR(MP_QSTR_uint8), MP_ROM_INT(NDARRAY_UINT8) }, + { MP_ROM_QSTR(MP_QSTR_int8), MP_ROM_INT(NDARRAY_INT8) }, + { MP_ROM_QSTR(MP_QSTR_uint16), MP_ROM_INT(NDARRAY_UINT16) }, + { MP_ROM_QSTR(MP_QSTR_int16), MP_ROM_INT(NDARRAY_INT16) }, + { MP_ROM_QSTR(MP_QSTR_float), MP_ROM_INT(NDARRAY_FLOAT) }, + #if ULAB_SUPPORTS_COMPLEX + { MP_ROM_QSTR(MP_QSTR_complex), MP_ROM_INT(NDARRAY_COMPLEX) }, + #endif + // modules of numpy + #if ULAB_NUMPY_HAS_FFT_MODULE + { MP_ROM_QSTR(MP_QSTR_fft), MP_ROM_PTR(&ulab_fft_module) }, + #endif + #if ULAB_NUMPY_HAS_LINALG_MODULE + { MP_ROM_QSTR(MP_QSTR_linalg), MP_ROM_PTR(&ulab_linalg_module) }, + #endif + #if ULAB_HAS_PRINTOPTIONS + { MP_ROM_QSTR(MP_QSTR_set_printoptions), (mp_obj_t)&ndarray_set_printoptions_obj }, + { MP_ROM_QSTR(MP_QSTR_get_printoptions), (mp_obj_t)&ndarray_get_printoptions_obj }, + #endif + #if ULAB_NUMPY_HAS_NDINFO + { MP_ROM_QSTR(MP_QSTR_ndinfo), (mp_obj_t)&ndarray_info_obj }, + #endif + #if ULAB_NUMPY_HAS_ARANGE + { MP_ROM_QSTR(MP_QSTR_arange), (mp_obj_t)&create_arange_obj }, + #endif + #if ULAB_NUMPY_HAS_COMPRESS + { MP_ROM_QSTR(MP_QSTR_compress), (mp_obj_t)&transform_compress_obj }, + #endif + #if ULAB_NUMPY_HAS_CONCATENATE + { MP_ROM_QSTR(MP_QSTR_concatenate), 
(mp_obj_t)&create_concatenate_obj }, + #endif + #if ULAB_NUMPY_HAS_DIAG + #if ULAB_MAX_DIMS > 1 + { MP_ROM_QSTR(MP_QSTR_diag), (mp_obj_t)&create_diag_obj }, + #endif + #endif + #if ULAB_NUMPY_HAS_EMPTY + { MP_ROM_QSTR(MP_QSTR_empty), (mp_obj_t)&create_zeros_obj }, + #endif + #if ULAB_MAX_DIMS > 1 + #if ULAB_NUMPY_HAS_EYE + { MP_ROM_QSTR(MP_QSTR_eye), (mp_obj_t)&create_eye_obj }, + #endif + #endif /* ULAB_MAX_DIMS */ + // functions of the approx sub-module + #if ULAB_NUMPY_HAS_INTERP + { MP_OBJ_NEW_QSTR(MP_QSTR_interp), (mp_obj_t)&approx_interp_obj }, + #endif + #if ULAB_NUMPY_HAS_TRAPZ + { MP_OBJ_NEW_QSTR(MP_QSTR_trapz), (mp_obj_t)&approx_trapz_obj }, + #endif + // functions of the create sub-module + #if ULAB_NUMPY_HAS_FULL + { MP_ROM_QSTR(MP_QSTR_full), (mp_obj_t)&create_full_obj }, + #endif + #if ULAB_NUMPY_HAS_LINSPACE + { MP_ROM_QSTR(MP_QSTR_linspace), (mp_obj_t)&create_linspace_obj }, + #endif + #if ULAB_NUMPY_HAS_LOGSPACE + { MP_ROM_QSTR(MP_QSTR_logspace), (mp_obj_t)&create_logspace_obj }, + #endif + #if ULAB_NUMPY_HAS_ONES + { MP_ROM_QSTR(MP_QSTR_ones), (mp_obj_t)&create_ones_obj }, + #endif + #if ULAB_NUMPY_HAS_ZEROS + { MP_ROM_QSTR(MP_QSTR_zeros), (mp_obj_t)&create_zeros_obj }, + #endif + // functions of the compare sub-module + #if ULAB_NUMPY_HAS_CLIP + { MP_OBJ_NEW_QSTR(MP_QSTR_clip), (mp_obj_t)&compare_clip_obj }, + #endif + #if ULAB_NUMPY_HAS_EQUAL + { MP_OBJ_NEW_QSTR(MP_QSTR_equal), (mp_obj_t)&compare_equal_obj }, + #endif + #if ULAB_NUMPY_HAS_NOTEQUAL + { MP_OBJ_NEW_QSTR(MP_QSTR_not_equal), (mp_obj_t)&compare_not_equal_obj }, + #endif + #if ULAB_NUMPY_HAS_ISFINITE + { MP_OBJ_NEW_QSTR(MP_QSTR_isfinite), (mp_obj_t)&compare_isfinite_obj }, + #endif + #if ULAB_NUMPY_HAS_ISINF + { MP_OBJ_NEW_QSTR(MP_QSTR_isinf), (mp_obj_t)&compare_isinf_obj }, + #endif + #if ULAB_NUMPY_HAS_MAXIMUM + { MP_OBJ_NEW_QSTR(MP_QSTR_maximum), (mp_obj_t)&compare_maximum_obj }, + #endif + #if ULAB_NUMPY_HAS_MINIMUM + { MP_OBJ_NEW_QSTR(MP_QSTR_minimum), 
(mp_obj_t)&compare_minimum_obj }, + #endif + #if ULAB_NUMPY_HAS_WHERE + { MP_OBJ_NEW_QSTR(MP_QSTR_where), (mp_obj_t)&compare_where_obj }, + #endif + // functions of the filter sub-module + #if ULAB_NUMPY_HAS_CONVOLVE + { MP_OBJ_NEW_QSTR(MP_QSTR_convolve), (mp_obj_t)&filter_convolve_obj }, + #endif + // functions of the numerical sub-module + #if ULAB_NUMPY_HAS_ALL + { MP_OBJ_NEW_QSTR(MP_QSTR_all), (mp_obj_t)&numerical_all_obj }, + #endif + #if ULAB_NUMPY_HAS_ANY + { MP_OBJ_NEW_QSTR(MP_QSTR_any), (mp_obj_t)&numerical_any_obj }, + #endif + #if ULAB_NUMPY_HAS_ARGMINMAX + { MP_OBJ_NEW_QSTR(MP_QSTR_argmax), (mp_obj_t)&numerical_argmax_obj }, + { MP_OBJ_NEW_QSTR(MP_QSTR_argmin), (mp_obj_t)&numerical_argmin_obj }, + #endif + #if ULAB_NUMPY_HAS_ARGSORT + { MP_OBJ_NEW_QSTR(MP_QSTR_argsort), (mp_obj_t)&numerical_argsort_obj }, + #endif + #if ULAB_NUMPY_HAS_CROSS + { MP_OBJ_NEW_QSTR(MP_QSTR_cross), (mp_obj_t)&numerical_cross_obj }, + #endif + #if ULAB_NUMPY_HAS_DIFF + { MP_OBJ_NEW_QSTR(MP_QSTR_diff), (mp_obj_t)&numerical_diff_obj }, + #endif + #if ULAB_NUMPY_HAS_DOT + #if ULAB_MAX_DIMS > 1 + { MP_OBJ_NEW_QSTR(MP_QSTR_dot), (mp_obj_t)&transform_dot_obj }, + #endif + #endif + #if ULAB_NUMPY_HAS_TRACE + #if ULAB_MAX_DIMS > 1 + { MP_ROM_QSTR(MP_QSTR_trace), (mp_obj_t)&stats_trace_obj }, + #endif + #endif + #if ULAB_NUMPY_HAS_FLIP + { MP_OBJ_NEW_QSTR(MP_QSTR_flip), (mp_obj_t)&numerical_flip_obj }, + #endif + #if ULAB_NUMPY_HAS_MINMAX + { MP_OBJ_NEW_QSTR(MP_QSTR_max), (mp_obj_t)&numerical_max_obj }, + #endif + #if ULAB_NUMPY_HAS_MEAN + { MP_OBJ_NEW_QSTR(MP_QSTR_mean), (mp_obj_t)&numerical_mean_obj }, + #endif + #if ULAB_NUMPY_HAS_MEDIAN + { MP_OBJ_NEW_QSTR(MP_QSTR_median), (mp_obj_t)&numerical_median_obj }, + #endif + #if ULAB_NUMPY_HAS_MINMAX + { MP_OBJ_NEW_QSTR(MP_QSTR_min), (mp_obj_t)&numerical_min_obj }, + #endif + #if ULAB_NUMPY_HAS_ROLL + { MP_OBJ_NEW_QSTR(MP_QSTR_roll), (mp_obj_t)&numerical_roll_obj }, + #endif + #if ULAB_NUMPY_HAS_SORT + { MP_OBJ_NEW_QSTR(MP_QSTR_sort), 
(mp_obj_t)&numerical_sort_obj }, + #endif + #if ULAB_NUMPY_HAS_STD + { MP_OBJ_NEW_QSTR(MP_QSTR_std), (mp_obj_t)&numerical_std_obj }, + #endif + #if ULAB_NUMPY_HAS_SUM + { MP_OBJ_NEW_QSTR(MP_QSTR_sum), (mp_obj_t)&numerical_sum_obj }, + #endif + // functions of the poly sub-module + #if ULAB_NUMPY_HAS_POLYFIT + { MP_OBJ_NEW_QSTR(MP_QSTR_polyfit), (mp_obj_t)&poly_polyfit_obj }, + #endif + #if ULAB_NUMPY_HAS_POLYVAL + { MP_OBJ_NEW_QSTR(MP_QSTR_polyval), (mp_obj_t)&poly_polyval_obj }, + #endif + // functions of the vector sub-module + #if ULAB_NUMPY_HAS_ACOS + { MP_OBJ_NEW_QSTR(MP_QSTR_acos), (mp_obj_t)&vector_acos_obj }, + #endif + #if ULAB_NUMPY_HAS_ACOSH + { MP_OBJ_NEW_QSTR(MP_QSTR_acosh), (mp_obj_t)&vector_acosh_obj }, + #endif + #if ULAB_NUMPY_HAS_ARCTAN2 + { MP_OBJ_NEW_QSTR(MP_QSTR_arctan2), (mp_obj_t)&vector_arctan2_obj }, + #endif + #if ULAB_NUMPY_HAS_AROUND + { MP_OBJ_NEW_QSTR(MP_QSTR_around), (mp_obj_t)&vector_around_obj }, + #endif + #if ULAB_NUMPY_HAS_ASIN + { MP_OBJ_NEW_QSTR(MP_QSTR_asin), (mp_obj_t)&vector_asin_obj }, + #endif + #if ULAB_NUMPY_HAS_ASINH + { MP_OBJ_NEW_QSTR(MP_QSTR_asinh), (mp_obj_t)&vector_asinh_obj }, + #endif + #if ULAB_NUMPY_HAS_ATAN + { MP_OBJ_NEW_QSTR(MP_QSTR_atan), (mp_obj_t)&vector_atan_obj }, + #endif + #if ULAB_NUMPY_HAS_ATANH + { MP_OBJ_NEW_QSTR(MP_QSTR_atanh), (mp_obj_t)&vector_atanh_obj }, + #endif + #if ULAB_NUMPY_HAS_CEIL + { MP_OBJ_NEW_QSTR(MP_QSTR_ceil), (mp_obj_t)&vector_ceil_obj }, + #endif + #if ULAB_NUMPY_HAS_COS + { MP_OBJ_NEW_QSTR(MP_QSTR_cos), (mp_obj_t)&vector_cos_obj }, + #endif + #if ULAB_NUMPY_HAS_COSH + { MP_OBJ_NEW_QSTR(MP_QSTR_cosh), (mp_obj_t)&vector_cosh_obj }, + #endif + #if ULAB_NUMPY_HAS_DEGREES + { MP_OBJ_NEW_QSTR(MP_QSTR_degrees), (mp_obj_t)&vector_degrees_obj }, + #endif + #if ULAB_NUMPY_HAS_EXP + { MP_OBJ_NEW_QSTR(MP_QSTR_exp), (mp_obj_t)&vector_exp_obj }, + #endif + #if ULAB_NUMPY_HAS_EXPM1 + { MP_OBJ_NEW_QSTR(MP_QSTR_expm1), (mp_obj_t)&vector_expm1_obj }, + #endif + #if ULAB_NUMPY_HAS_FLOOR + { 
MP_OBJ_NEW_QSTR(MP_QSTR_floor), (mp_obj_t)&vector_floor_obj }, + #endif + #if ULAB_NUMPY_HAS_LOG + { MP_OBJ_NEW_QSTR(MP_QSTR_log), (mp_obj_t)&vector_log_obj }, + #endif + #if ULAB_NUMPY_HAS_LOG10 + { MP_OBJ_NEW_QSTR(MP_QSTR_log10), (mp_obj_t)&vector_log10_obj }, + #endif + #if ULAB_NUMPY_HAS_LOG2 + { MP_OBJ_NEW_QSTR(MP_QSTR_log2), (mp_obj_t)&vector_log2_obj }, + #endif + #if ULAB_NUMPY_HAS_RADIANS + { MP_OBJ_NEW_QSTR(MP_QSTR_radians), (mp_obj_t)&vector_radians_obj }, + #endif + #if ULAB_NUMPY_HAS_SIN + { MP_OBJ_NEW_QSTR(MP_QSTR_sin), (mp_obj_t)&vector_sin_obj }, + #endif + #if ULAB_NUMPY_HAS_SINH + { MP_OBJ_NEW_QSTR(MP_QSTR_sinh), (mp_obj_t)&vector_sinh_obj }, + #endif + #if ULAB_NUMPY_HAS_SQRT + { MP_OBJ_NEW_QSTR(MP_QSTR_sqrt), (mp_obj_t)&vector_sqrt_obj }, + #endif + #if ULAB_NUMPY_HAS_TAN + { MP_OBJ_NEW_QSTR(MP_QSTR_tan), (mp_obj_t)&vector_tan_obj }, + #endif + #if ULAB_NUMPY_HAS_TANH + { MP_OBJ_NEW_QSTR(MP_QSTR_tanh), (mp_obj_t)&vector_tanh_obj }, + #endif + #if ULAB_NUMPY_HAS_VECTORIZE + { MP_OBJ_NEW_QSTR(MP_QSTR_vectorize), (mp_obj_t)&vector_vectorize_obj }, + #endif + #if ULAB_SUPPORTS_COMPLEX + #if ULAB_NUMPY_HAS_REAL + { MP_OBJ_NEW_QSTR(MP_QSTR_real), (mp_obj_t)&carray_real_obj }, + #endif + #if ULAB_NUMPY_HAS_IMAG + { MP_OBJ_NEW_QSTR(MP_QSTR_imag), (mp_obj_t)&carray_imag_obj }, + #endif + #if ULAB_NUMPY_HAS_CONJUGATE + { MP_ROM_QSTR(MP_QSTR_conjugate), (mp_obj_t)&carray_conjugate_obj }, + #endif + #if ULAB_NUMPY_HAS_SORT_COMPLEX + { MP_ROM_QSTR(MP_QSTR_sort_complex), (mp_obj_t)&carray_sort_complex_obj }, + #endif + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_numpy_globals, ulab_numpy_globals_table); + +const mp_obj_module_t ulab_numpy_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_numpy_globals, +}; + +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_numpy, ulab_numpy_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); diff --git a/circuitpython/extmod/ulab/code/numpy/numpy.h 
b/circuitpython/extmod/ulab/code/numpy/numpy.h new file mode 100644 index 0000000..f1348f3 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/numpy.h @@ -0,0 +1,21 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös + * +*/ + +#ifndef _NUMPY_ +#define _NUMPY_ + +#include "../ulab.h" +#include "../ndarray.h" + +extern const mp_obj_module_t ulab_numpy_module; + +#endif /* _NUMPY_ */ diff --git a/circuitpython/extmod/ulab/code/numpy/poly.c b/circuitpython/extmod/ulab/code/numpy/poly.c new file mode 100644 index 0000000..97ee5c7 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/poly.c @@ -0,0 +1,250 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Taku Fukada +*/ + +#include "py/obj.h" +#include "py/runtime.h" +#include "py/objarray.h" + +#include "../ulab.h" +#include "linalg/linalg_tools.h" +#include "../ulab_tools.h" +#include "carray/carray_tools.h" +#include "poly.h" + +#if ULAB_NUMPY_HAS_POLYFIT + +mp_obj_t poly_polyfit(size_t n_args, const mp_obj_t *args) { + if(!ndarray_object_is_array_like(args[0])) { + mp_raise_ValueError(translate("input data must be an iterable")); + } + #if ULAB_SUPPORTS_COMPLEX + if(mp_obj_is_type(args[0], &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0]); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + } + #endif + size_t lenx = 0, leny = 0; + uint8_t deg = 0; + mp_float_t *x, *XT, *y, *prod; + + if(n_args == 2) { // only the y values are supplied + // TODO: this is actually not enough: the first argument can very well be a matrix, + // in which case we are between the rock and a hard place + leny = 
(size_t)mp_obj_get_int(mp_obj_len_maybe(args[0])); + deg = (uint8_t)mp_obj_get_int(args[1]); + if(leny < deg) { + mp_raise_ValueError(translate("more degrees of freedom than data points")); + } + lenx = leny; + x = m_new(mp_float_t, lenx); // assume uniformly spaced data points + for(size_t i=0; i < lenx; i++) { + x[i] = i; + } + y = m_new(mp_float_t, leny); + fill_array_iterable(y, args[0]); + } else /* n_args == 3 */ { + if(!ndarray_object_is_array_like(args[1])) { + mp_raise_ValueError(translate("input data must be an iterable")); + } + lenx = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0])); + leny = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[1])); + if(lenx != leny) { + mp_raise_ValueError(translate("input vectors must be of equal length")); + } + deg = (uint8_t)mp_obj_get_int(args[2]); + if(leny < deg) { + mp_raise_ValueError(translate("more degrees of freedom than data points")); + } + x = m_new(mp_float_t, lenx); + fill_array_iterable(x, args[0]); + y = m_new(mp_float_t, leny); + fill_array_iterable(y, args[1]); + } + + // one could probably express X as a function of XT, + // and thereby save RAM, because X is used only in the product + XT = m_new(mp_float_t, (deg+1)*leny); // XT is a matrix of shape (deg+1, len) (rows, columns) + for(size_t i=0; i < leny; i++) { // column index + XT[i+0*lenx] = 1.0; // top row + for(uint8_t j=1; j < deg+1; j++) { // row index + XT[i+j*leny] = XT[i+(j-1)*leny]*x[i]; + } + } + + prod = m_new(mp_float_t, (deg+1)*(deg+1)); // the product matrix is of shape (deg+1, deg+1) + mp_float_t sum; + for(uint8_t i=0; i < deg+1; i++) { // column index + for(uint8_t j=0; j < deg+1; j++) { // row index + sum = 0.0; + for(size_t k=0; k < lenx; k++) { + // (j, k) * (k, i) + // Note that the second matrix is simply the transpose of the first: + // X(k, i) = XT(i, k) = XT[k*lenx+i] + sum += XT[j*lenx+k]*XT[i*lenx+k]; // X[k*(deg+1)+i]; + } + prod[j*(deg+1)+i] = sum; + } + } + if(!linalg_invert_matrix(prod, deg+1)) { + // Although X was a 
Vandermonde matrix, whose inverse is guaranteed to exist, + // we bail out here, if prod couldn't be inverted: if the values in x are not all + // distinct, prod is singular + m_del(mp_float_t, XT, (deg+1)*lenx); + m_del(mp_float_t, x, lenx); + m_del(mp_float_t, y, lenx); + m_del(mp_float_t, prod, (deg+1)*(deg+1)); + mp_raise_ValueError(translate("could not invert Vandermonde matrix")); + } + // at this point, we have the inverse of X^T * X + // y is a column vector; x is free now, we can use it for storing intermediate values + for(uint8_t i=0; i < deg+1; i++) { // row index + sum = 0.0; + for(size_t j=0; j < lenx; j++) { // column index + sum += XT[i*lenx+j]*y[j]; + } + x[i] = sum; + } + // XT is no longer needed + m_del(mp_float_t, XT, (deg+1)*leny); + + ndarray_obj_t *beta = ndarray_new_linear_array(deg+1, NDARRAY_FLOAT); + mp_float_t *betav = (mp_float_t *)beta->array; + // x[0..(deg+1)] contains now the product X^T * y; we can get rid of y + m_del(float, y, leny); + + // now, we calculate beta, i.e., we apply prod = (X^T * X)^(-1) on x = X^T * y; x is a column vector now + for(uint8_t i=0; i < deg+1; i++) { + sum = 0.0; + for(uint8_t j=0; j < deg+1; j++) { + sum += prod[i*(deg+1)+j]*x[j]; + } + betav[i] = sum; + } + m_del(mp_float_t, x, lenx); + m_del(mp_float_t, prod, (deg+1)*(deg+1)); + for(uint8_t i=0; i < (deg+1)/2; i++) { + // We have to reverse the array, for the leading coefficient comes first. 
+ SWAP(mp_float_t, betav[i], betav[deg-i]); + } + return MP_OBJ_FROM_PTR(beta); +} + +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(poly_polyfit_obj, 2, 3, poly_polyfit); +#endif + +#if ULAB_NUMPY_HAS_POLYVAL + +mp_obj_t poly_polyval(mp_obj_t o_p, mp_obj_t o_x) { + if(!ndarray_object_is_array_like(o_p) || !ndarray_object_is_array_like(o_x)) { + mp_raise_TypeError(translate("inputs are not iterable")); + } + #if ULAB_SUPPORTS_COMPLEX + ndarray_obj_t *input; + if(mp_obj_is_type(o_p, &ulab_ndarray_type)) { + input = MP_OBJ_TO_PTR(o_p); + COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype) + } + if(mp_obj_is_type(o_x, &ulab_ndarray_type)) { + input = MP_OBJ_TO_PTR(o_x); + COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype) + } + #endif + // p had better be a one-dimensional standard iterable + uint8_t plen = mp_obj_get_int(mp_obj_len_maybe(o_p)); + mp_float_t *p = m_new(mp_float_t, plen); + mp_obj_iter_buf_t p_buf; + mp_obj_t p_item, p_iterable = mp_getiter(o_p, &p_buf); + uint8_t i = 0; + while((p_item = mp_iternext(p_iterable)) != MP_OBJ_STOP_ITERATION) { + p[i] = mp_obj_get_float(p_item); + i++; + } + + // polynomials are going to be of type float, except, when both + // the coefficients and the independent variable are integers + ndarray_obj_t *ndarray; + if(mp_obj_is_type(o_x, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(o_x); + uint8_t *sarray = (uint8_t *)source->array; + ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); + mp_float_t *array = (mp_float_t *)ndarray->array; + + mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype); + + // TODO: these loops are really nothing, but the re-impplementation of + // ITERATE_VECTOR from vectorise.c. 
We could pass a function pointer here + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t y = p[0]; + mp_float_t _x = func(sarray); + for(uint8_t m=0; m < plen-1; m++) { + y *= _x; + y += p[m+1]; + } + *array++ = y; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif + } else { + // o_x had better be a one-dimensional standard iterable + ndarray = ndarray_new_linear_array(mp_obj_get_int(mp_obj_len_maybe(o_x)), NDARRAY_FLOAT); + mp_float_t *array = (mp_float_t *)ndarray->array; + mp_obj_iter_buf_t x_buf; + mp_obj_t x_item, x_iterable = mp_getiter(o_x, &x_buf); + while ((x_item = mp_iternext(x_iterable)) != MP_OBJ_STOP_ITERATION) { + mp_float_t _x = mp_obj_get_float(x_item); + mp_float_t y = p[0]; + for(uint8_t j=0; j < plen-1; j++) { + y *= _x; + y += p[j+1]; + } + *array++ = y; + } + } + m_del(mp_float_t, p, plen); + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_2(poly_polyval_obj, poly_polyval); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/poly.h b/circuitpython/extmod/ulab/code/numpy/poly.h new file mode 100644 index 0000000..59cb9f5 --- /dev/null +++ 
b/circuitpython/extmod/ulab/code/numpy/poly.h @@ -0,0 +1,21 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _POLY_ +#define _POLY_ + +#include "../ulab.h" +#include "../ndarray.h" + +MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(poly_polyfit_obj); +MP_DECLARE_CONST_FUN_OBJ_2(poly_polyval_obj); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/stats.c b/circuitpython/extmod/ulab/code/numpy/stats.c new file mode 100644 index 0000000..2d34889 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/stats.c @@ -0,0 +1,54 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Roberto Colistete Jr. + * 2020 Taku Fukada + * +*/ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "carray/carray_tools.h" +#include "stats.h" + +#if ULAB_MAX_DIMS > 1 +#if ULAB_NUMPY_HAS_TRACE + +//| def trace(m: ulab.numpy.ndarray) -> _float: +//| """ +//| :param m: a square matrix +//| +//| Compute the trace of the matrix, the sum of its diagonal elements.""" +//| ... 
+//| + +static mp_obj_t stats_trace(mp_obj_t oin) { + ndarray_obj_t *ndarray = tools_object_is_square(oin); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + mp_float_t trace = 0.0; + for(size_t i=0; i < ndarray->shape[ULAB_MAX_DIMS - 1]; i++) { + int32_t pos = i * (ndarray->strides[ULAB_MAX_DIMS - 1] + ndarray->strides[ULAB_MAX_DIMS - 2]); + trace += ndarray_get_float_index(ndarray->array, ndarray->dtype, pos/ndarray->itemsize); + } + if(ndarray->dtype == NDARRAY_FLOAT) { + return mp_obj_new_float(trace); + } + return mp_obj_new_int_from_float(trace); +} + +MP_DEFINE_CONST_FUN_OBJ_1(stats_trace_obj, stats_trace); +#endif +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/stats.h b/circuitpython/extmod/ulab/code/numpy/stats.h new file mode 100644 index 0000000..62bba9f --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/stats.h @@ -0,0 +1,20 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _STATS_ +#define _STATS_ + +#include "../ulab.h" +#include "../ndarray.h" + +MP_DECLARE_CONST_FUN_OBJ_1(stats_trace_obj); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/transform.c b/circuitpython/extmod/ulab/code/numpy/transform.c new file mode 100644 index 0000000..f0e3e70 --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/transform.c @@ -0,0 +1,224 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * +*/ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "carray/carray_tools.h" +#include "transform.h" + +#if ULAB_NUMPY_HAS_COMPRESS +static mp_obj_t transform_compress(size_t n_args, const mp_obj_t *pos_args, mp_map_t 
*kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t condition = args[0].u_obj; + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj); + uint8_t *array = (uint8_t *)ndarray->array; + mp_obj_t axis = args[2].u_obj; + + size_t len = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(condition)); + int8_t ax, shift_ax; + + if(axis != mp_const_none) { + ax = tools_get_axis(axis, ndarray->ndim); + shift_ax = ULAB_MAX_DIMS - ndarray->ndim + ax; + } + + if(((axis == mp_const_none) && (len != ndarray->len)) || + ((axis != mp_const_none) && (len != ndarray->shape[shift_ax]))) { + mp_raise_ValueError(translate("wrong length of condition array")); + } + + size_t true_count = 0; + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(condition, &iter_buf); + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + if(mp_obj_is_true(item)) { + true_count++; + } + } + + iterable = mp_getiter(condition, &iter_buf); + + ndarray_obj_t *result = NULL; + uint8_t *rarray = NULL; + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + memcpy(shape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t)); + + size_t *rshape = m_new(size_t, ULAB_MAX_DIMS); + memcpy(rshape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t)); + + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS); + memcpy(strides, ndarray->strides, ULAB_MAX_DIMS * sizeof(int32_t)); + + int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS); + + if(axis == mp_const_none) { + result = ndarray_new_linear_array(true_count, ndarray->dtype); + rarray = (uint8_t *)result->array; + memset(rstrides, 0, ndarray->ndim * sizeof(int32_t)); + 
rstrides[ULAB_MAX_DIMS - 1] = ndarray->itemsize; + rshape[ULAB_MAX_DIMS - 1] = 0; + } else { + rshape[shift_ax] = true_count; + + result = ndarray_new_dense_ndarray(ndarray->ndim, rshape, ndarray->dtype); + rarray = (uint8_t *)result->array; + + SWAP(size_t, shape[shift_ax], shape[ULAB_MAX_DIMS - 1]); + SWAP(size_t, rshape[shift_ax], rshape[ULAB_MAX_DIMS - 1]); + SWAP(int32_t, strides[shift_ax], strides[ULAB_MAX_DIMS - 1]); + + memcpy(rstrides, result->strides, ULAB_MAX_DIMS * sizeof(int32_t)); + SWAP(int32_t, rstrides[shift_ax], rstrides[ULAB_MAX_DIMS - 1]); + } + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + if(axis != mp_const_none) { + iterable = mp_getiter(condition, &iter_buf); + } + do { + item = mp_iternext(iterable); + if(mp_obj_is_true(item)) { + memcpy(rarray, array, ndarray->itemsize); + rarray += rstrides[ULAB_MAX_DIMS - 1]; + } + array += strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1]; + array += strides[ULAB_MAX_DIMS - 2]; + rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS - 1]; + rarray += rstrides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2]; + array += strides[ULAB_MAX_DIMS - 3]; + rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS - 2]; + rarray += rstrides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + array -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS - 3]; + array += strides[ULAB_MAX_DIMS - 4]; + rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS - 2]; + rarray += rstrides[ULAB_MAX_DIMS - 3]; + i++; + } while(i < shape[ULAB_MAX_DIMS - 4]); + #endif + + return result; +} + 
+MP_DEFINE_CONST_FUN_OBJ_KW(transform_compress_obj, 2, transform_compress); +#endif /* ULAB_NUMPY_HAS_COMPRESS */ + +#if ULAB_MAX_DIMS > 1 +#if ULAB_NUMPY_HAS_DOT +//| def dot(m1: ulab.numpy.ndarray, m2: ulab.numpy.ndarray) -> Union[ulab.numpy.ndarray, _float]: +//| """ +//| :param ~ulab.numpy.ndarray m1: a matrix, or a vector +//| :param ~ulab.numpy.ndarray m2: a matrix, or a vector +//| +//| Computes the product of two matrices, or two vectors. In the letter case, the inner product is returned.""" +//| ... +//| + +mp_obj_t transform_dot(mp_obj_t _m1, mp_obj_t _m2) { + // TODO: should the results be upcast? + // This implements 2D operations only! + if(!mp_obj_is_type(_m1, &ulab_ndarray_type) || !mp_obj_is_type(_m2, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("arguments must be ndarrays")); + } + ndarray_obj_t *m1 = MP_OBJ_TO_PTR(_m1); + ndarray_obj_t *m2 = MP_OBJ_TO_PTR(_m2); + COMPLEX_DTYPE_NOT_IMPLEMENTED(m1->dtype) + COMPLEX_DTYPE_NOT_IMPLEMENTED(m2->dtype) + + uint8_t *array1 = (uint8_t *)m1->array; + uint8_t *array2 = (uint8_t *)m2->array; + + mp_float_t (*func1)(void *) = ndarray_get_float_function(m1->dtype); + mp_float_t (*func2)(void *) = ndarray_get_float_function(m2->dtype); + + if(m1->shape[ULAB_MAX_DIMS - 1] != m2->shape[ULAB_MAX_DIMS - m2->ndim]) { + mp_raise_ValueError(translate("dimensions do not match")); + } + uint8_t ndim = MIN(m1->ndim, m2->ndim); + size_t shape1 = m1->ndim == 2 ? m1->shape[ULAB_MAX_DIMS - m1->ndim] : 1; + size_t shape2 = m2->ndim == 2 ? 
m2->shape[ULAB_MAX_DIMS - 1] : 1; + + size_t *shape = NULL; + if(ndim == 2) { // matrix times matrix -> matrix + shape = ndarray_shape_vector(0, 0, shape1, shape2); + } else { // matrix times vector -> vector, vector times vector -> vector (size 1) + shape = ndarray_shape_vector(0, 0, 0, shape1); + } + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + mp_float_t *rarray = (mp_float_t *)results->array; + + for(size_t i=0; i < shape1; i++) { // rows of m1 + for(size_t j=0; j < shape2; j++) { // columns of m2 + mp_float_t dot = 0.0; + for(size_t k=0; k < m1->shape[ULAB_MAX_DIMS - 1]; k++) { + // (i, k) * (k, j) + dot += func1(array1) * func2(array2); + array1 += m1->strides[ULAB_MAX_DIMS - 1]; + array2 += m2->strides[ULAB_MAX_DIMS - m2->ndim]; + } + *rarray++ = dot; + array1 -= m1->strides[ULAB_MAX_DIMS - 1] * m1->shape[ULAB_MAX_DIMS - 1]; + array2 -= m2->strides[ULAB_MAX_DIMS - m2->ndim] * m2->shape[ULAB_MAX_DIMS - m2->ndim]; + array2 += m2->strides[ULAB_MAX_DIMS - 1]; + } + array1 += m1->strides[ULAB_MAX_DIMS - m1->ndim]; + array2 = m2->array; + } + if((m1->ndim * m2->ndim) == 1) { // return a scalar, if product of two vectors + return mp_obj_new_float(*(--rarray)); + } else { + return MP_OBJ_FROM_PTR(results); + } +} + +MP_DEFINE_CONST_FUN_OBJ_2(transform_dot_obj, transform_dot); +#endif +#endif
\ No newline at end of file diff --git a/circuitpython/extmod/ulab/code/numpy/transform.h b/circuitpython/extmod/ulab/code/numpy/transform.h new file mode 100644 index 0000000..039dcea --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/transform.h @@ -0,0 +1,29 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * +*/ + +#ifndef _TRANSFORM_ +#define _TRANSFORM_ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "transform.h" + +MP_DECLARE_CONST_FUN_OBJ_KW(transform_compress_obj); +MP_DECLARE_CONST_FUN_OBJ_2(transform_dot_obj); + +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/vector.c b/circuitpython/extmod/ulab/code/numpy/vector.c new file mode 100644 index 0000000..97ab66d --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/vector.c @@ -0,0 +1,844 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020 Taku Fukada +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include "py/runtime.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objarray.h" + +#include "../ulab.h" +#include "../ulab_tools.h" +#include "carray/carray_tools.h" +#include "vector.h" + +//| """Element-by-element functions +//| +//| These functions can operate on numbers, 1-D iterables, and arrays of 1 to 4 dimensions by +//| applying the function to every element in the array. 
This is typically +//| much more efficient than expressing the same operation as a Python loop.""" +//| + +static mp_obj_t vector_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float_t)) { + // Return a single value, if o_in is not iterable + if(mp_obj_is_float(o_in) || mp_obj_is_int(o_in)) { + return mp_obj_new_float(f(mp_obj_get_float(o_in))); + } + ndarray_obj_t *ndarray = NULL; + if(mp_obj_is_type(o_in, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in); + COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype) + uint8_t *sarray = (uint8_t *)source->array; + ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); + mp_float_t *array = (mp_float_t *)ndarray->array; + + #if ULAB_VECTORISE_USES_FUN_POINTER + + mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t value = func(sarray); + *array++ = f(value); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif /* ULAB_MAX_DIMS > 3 */ + #else + if(source->dtype == NDARRAY_UINT8) { + ITERATE_VECTOR(uint8_t, array, source, 
sarray); + } else if(source->dtype == NDARRAY_INT8) { + ITERATE_VECTOR(int8_t, array, source, sarray); + } else if(source->dtype == NDARRAY_UINT16) { + ITERATE_VECTOR(uint16_t, array, source, sarray); + } else if(source->dtype == NDARRAY_INT16) { + ITERATE_VECTOR(int16_t, array, source, sarray); + } else { + ITERATE_VECTOR(mp_float_t, array, source, sarray); + } + #endif /* ULAB_VECTORISE_USES_FUN_POINTER */ + } else { + ndarray = ndarray_from_mp_obj(o_in, 0); + mp_float_t *narray = (mp_float_t *)ndarray->array; + for(size_t i = 0; i < ndarray->len; i++) { + *narray = f(*narray); + narray++; + } + } + return MP_OBJ_FROM_PTR(ndarray); +} + +#if ULAB_NUMPY_HAS_ACOS +//| def acos(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse cosine function""" +//| ... +//| + +MATH_FUN_1(acos, acos); +MP_DEFINE_CONST_FUN_OBJ_1(vector_acos_obj, vector_acos); +#endif + +#if ULAB_NUMPY_HAS_ACOSH +//| def acosh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse hyperbolic cosine function""" +//| ... +//| + +MATH_FUN_1(acosh, acosh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_acosh_obj, vector_acosh); +#endif + +#if ULAB_NUMPY_HAS_ASIN +//| def asin(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse sine function""" +//| ... +//| + +MATH_FUN_1(asin, asin); +MP_DEFINE_CONST_FUN_OBJ_1(vector_asin_obj, vector_asin); +#endif + +#if ULAB_NUMPY_HAS_ASINH +//| def asinh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse hyperbolic sine function""" +//| ... +//| + +MATH_FUN_1(asinh, asinh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_asinh_obj, vector_asinh); +#endif + +#if ULAB_NUMPY_HAS_AROUND +//| def around(a: _ArrayLike, *, decimals: int = 0) -> ulab.numpy.ndarray: +//| """Returns a new float array in which each element is rounded to +//| ``decimals`` places.""" +//| ... 
+//| + +mp_obj_t vector_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + { MP_QSTR_decimals, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 0 } } + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("first argument must be an ndarray")); + } + int8_t n = args[1].u_int; + mp_float_t mul = MICROPY_FLOAT_C_FUN(pow)(10.0, n); + ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj); + COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype) + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); + mp_float_t *narray = (mp_float_t *)ndarray->array; + uint8_t *sarray = (uint8_t *)source->array; + + mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t f = func(sarray); + *narray++ = MICROPY_FLOAT_C_FUN(round)(f * mul) / mul; + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + 
i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif + return MP_OBJ_FROM_PTR(ndarray); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(vector_around_obj, 1, vector_around); +#endif + +#if ULAB_NUMPY_HAS_ATAN +//| def atan(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse tangent function; the return values are in the +//| range [-pi/2,pi/2].""" +//| ... +//| + +MATH_FUN_1(atan, atan); +MP_DEFINE_CONST_FUN_OBJ_1(vector_atan_obj, vector_atan); +#endif + +#if ULAB_NUMPY_HAS_ARCTAN2 +//| def arctan2(ya: _ArrayLike, xa: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse tangent function of y/x; the return values are in +//| the range [-pi, pi].""" +//| ... +//| + +mp_obj_t vector_arctan2(mp_obj_t y, mp_obj_t x) { + ndarray_obj_t *ndarray_x = ndarray_from_mp_obj(x, 0); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_x->dtype) + + ndarray_obj_t *ndarray_y = ndarray_from_mp_obj(y, 0); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_y->dtype) + + uint8_t ndim = 0; + size_t *shape = m_new(size_t, ULAB_MAX_DIMS); + int32_t *xstrides = m_new(int32_t, ULAB_MAX_DIMS); + int32_t *ystrides = m_new(int32_t, ULAB_MAX_DIMS); + if(!ndarray_can_broadcast(ndarray_x, ndarray_y, &ndim, shape, xstrides, ystrides)) { + mp_raise_ValueError(translate("operands could not be broadcast together")); + m_del(size_t, shape, ULAB_MAX_DIMS); + m_del(int32_t, xstrides, ULAB_MAX_DIMS); + m_del(int32_t, ystrides, ULAB_MAX_DIMS); + } + + uint8_t *xarray = (uint8_t *)ndarray_x->array; + uint8_t *yarray = (uint8_t *)ndarray_y->array; + + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT); + mp_float_t *rarray = (mp_float_t *)results->array; + + mp_float_t (*funcx)(void *) = ndarray_get_float_function(ndarray_x->dtype); + mp_float_t (*funcy)(void *) = ndarray_get_float_function(ndarray_y->dtype); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + 
size_t l = 0; + do { + mp_float_t _x = funcx(xarray); + mp_float_t _y = funcy(yarray); + *rarray++ = MICROPY_FLOAT_C_FUN(atan2)(_y, _x); + xarray += xstrides[ULAB_MAX_DIMS - 1]; + yarray += ystrides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < results->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + xarray -= xstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; + xarray += xstrides[ULAB_MAX_DIMS - 2]; + yarray -= ystrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; + yarray += ystrides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < results->shape[ULAB_MAX_DIMS - 2]); + #endif + #if ULAB_MAX_DIMS > 2 + xarray -= xstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2]; + xarray += xstrides[ULAB_MAX_DIMS - 3]; + yarray -= ystrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2]; + yarray += ystrides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < results->shape[ULAB_MAX_DIMS - 3]); + #endif + #if ULAB_MAX_DIMS > 3 + xarray -= xstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3]; + xarray += xstrides[ULAB_MAX_DIMS - 4]; + yarray -= ystrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3]; + yarray += ystrides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < results->shape[ULAB_MAX_DIMS - 4]); + #endif + + return MP_OBJ_FROM_PTR(results); +} + +MP_DEFINE_CONST_FUN_OBJ_2(vector_arctan2_obj, vector_arctan2); +#endif /* ULAB_VECTORISE_HAS_ARCTAN2 */ + +#if ULAB_NUMPY_HAS_ATANH +//| def atanh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the inverse hyperbolic tangent function""" +//| ... +//| + +MATH_FUN_1(atanh, atanh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_atanh_obj, vector_atanh); +#endif + +#if ULAB_NUMPY_HAS_CEIL +//| def ceil(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Rounds numbers up to the next whole number""" +//| ... 
+//| + +MATH_FUN_1(ceil, ceil); +MP_DEFINE_CONST_FUN_OBJ_1(vector_ceil_obj, vector_ceil); +#endif + +#if ULAB_NUMPY_HAS_COS +//| def cos(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the cosine function""" +//| ... +//| + +MATH_FUN_1(cos, cos); +MP_DEFINE_CONST_FUN_OBJ_1(vector_cos_obj, vector_cos); +#endif + +#if ULAB_NUMPY_HAS_COSH +//| def cosh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the hyperbolic cosine function""" +//| ... +//| + +MATH_FUN_1(cosh, cosh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_cosh_obj, vector_cosh); +#endif + +#if ULAB_NUMPY_HAS_DEGREES +//| def degrees(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Converts angles from radians to degrees""" +//| ... +//| + +static mp_float_t vector_degrees_(mp_float_t value) { + return value * MICROPY_FLOAT_CONST(180.0) / MP_PI; +} + +static mp_obj_t vector_degrees(mp_obj_t x_obj) { + return vector_generic_vector(x_obj, vector_degrees_); +} + +MP_DEFINE_CONST_FUN_OBJ_1(vector_degrees_obj, vector_degrees); +#endif + +#if ULAB_SCIPY_SPECIAL_HAS_ERF +//| def erf(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the error function, which has applications in statistics""" +//| ... +//| + +MATH_FUN_1(erf, erf); +MP_DEFINE_CONST_FUN_OBJ_1(vector_erf_obj, vector_erf); +#endif + +#if ULAB_SCIPY_SPECIAL_HAS_ERFC +//| def erfc(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the complementary error function, which has applications in statistics""" +//| ... +//| + +MATH_FUN_1(erfc, erfc); +MP_DEFINE_CONST_FUN_OBJ_1(vector_erfc_obj, vector_erfc); +#endif + +#if ULAB_NUMPY_HAS_EXP +//| def exp(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the exponent function.""" +//| ... 
+//| + +static mp_obj_t vector_exp(mp_obj_t o_in) { + #if ULAB_SUPPORTS_COMPLEX + if(mp_obj_is_type(o_in, &mp_type_complex)) { + mp_float_t real, imag; + mp_obj_get_complex(o_in, &real, &imag); + mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real); + return mp_obj_new_complex(exp_real * MICROPY_FLOAT_C_FUN(cos)(imag), exp_real * MICROPY_FLOAT_C_FUN(sin)(imag)); + } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in); + if(source->dtype == NDARRAY_COMPLEX) { + uint8_t *sarray = (uint8_t *)source->array; + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX); + mp_float_t *array = (mp_float_t *)ndarray->array; + uint8_t itemsize = sizeof(mp_float_t); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t real = *(mp_float_t *)sarray; + mp_float_t imag = *(mp_float_t *)(sarray + itemsize); + mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real); + *array++ = exp_real * MICROPY_FLOAT_C_FUN(cos)(imag); + *array++ = exp_real * MICROPY_FLOAT_C_FUN(sin)(imag); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 
4]); + #endif /* ULAB_MAX_DIMS > 3 */ + return MP_OBJ_FROM_PTR(ndarray); + } + } + #endif + return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(exp)); +} + +MP_DEFINE_CONST_FUN_OBJ_1(vector_exp_obj, vector_exp); +#endif + +#if ULAB_NUMPY_HAS_EXPM1 +//| def expm1(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes $e^x-1$. In certain applications, using this function preserves numeric accuracy better than the `exp` function.""" +//| ... +//| + +MATH_FUN_1(expm1, expm1); +MP_DEFINE_CONST_FUN_OBJ_1(vector_expm1_obj, vector_expm1); +#endif + +#if ULAB_NUMPY_HAS_FLOOR +//| def floor(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Rounds numbers up to the next whole number""" +//| ... +//| + +MATH_FUN_1(floor, floor); +MP_DEFINE_CONST_FUN_OBJ_1(vector_floor_obj, vector_floor); +#endif + +#if ULAB_SCIPY_SPECIAL_HAS_GAMMA +//| def gamma(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the gamma function""" +//| ... +//| + +MATH_FUN_1(gamma, tgamma); +MP_DEFINE_CONST_FUN_OBJ_1(vector_gamma_obj, vector_gamma); +#endif + +#if ULAB_SCIPY_SPECIAL_HAS_GAMMALN +//| def lgamma(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the natural log of the gamma function""" +//| ... +//| + +MATH_FUN_1(lgamma, lgamma); +MP_DEFINE_CONST_FUN_OBJ_1(vector_lgamma_obj, vector_lgamma); +#endif + +#if ULAB_NUMPY_HAS_LOG +//| def log(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the natural log""" +//| ... +//| + +MATH_FUN_1(log, log); +MP_DEFINE_CONST_FUN_OBJ_1(vector_log_obj, vector_log); +#endif + +#if ULAB_NUMPY_HAS_LOG10 +//| def log10(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the log base 10""" +//| ... +//| + +MATH_FUN_1(log10, log10); +MP_DEFINE_CONST_FUN_OBJ_1(vector_log10_obj, vector_log10); +#endif + +#if ULAB_NUMPY_HAS_LOG2 +//| def log2(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the log base 2""" +//| ... 
+//| + +MATH_FUN_1(log2, log2); +MP_DEFINE_CONST_FUN_OBJ_1(vector_log2_obj, vector_log2); +#endif + +#if ULAB_NUMPY_HAS_RADIANS +//| def radians(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Converts angles from degrees to radians""" +//| ... +//| + +static mp_float_t vector_radians_(mp_float_t value) { + return value * MP_PI / MICROPY_FLOAT_CONST(180.0); +} + +static mp_obj_t vector_radians(mp_obj_t x_obj) { + return vector_generic_vector(x_obj, vector_radians_); +} + +MP_DEFINE_CONST_FUN_OBJ_1(vector_radians_obj, vector_radians); +#endif + +#if ULAB_NUMPY_HAS_SIN +//| def sin(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the sine function""" +//| ... +//| + +MATH_FUN_1(sin, sin); +MP_DEFINE_CONST_FUN_OBJ_1(vector_sin_obj, vector_sin); +#endif + +#if ULAB_NUMPY_HAS_SINH +//| def sinh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the hyperbolic sine""" +//| ... +//| + +MATH_FUN_1(sinh, sinh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_sinh_obj, vector_sinh); +#endif + + +#if ULAB_NUMPY_HAS_SQRT +//| def sqrt(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the square root""" +//| ... 
+//| + +#if ULAB_SUPPORTS_COMPLEX +mp_obj_t vector_sqrt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t o_in = args[0].u_obj; + uint8_t dtype = mp_obj_get_int(args[1].u_obj); + if((dtype != NDARRAY_FLOAT) && (dtype != NDARRAY_COMPLEX)) { + mp_raise_TypeError(translate("dtype must be float, or complex")); + } + + if(mp_obj_is_type(o_in, &mp_type_complex)) { + mp_float_t real, imag; + mp_obj_get_complex(o_in, &real, &imag); + mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag); + sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs); + mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real); + return mp_obj_new_complex(sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta), sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta)); + } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in); + if((source->dtype == NDARRAY_COMPLEX) && (dtype == NDARRAY_FLOAT)) { + mp_raise_TypeError(translate("can't convert complex to float")); + } + + if(dtype == NDARRAY_COMPLEX) { + if(source->dtype == NDARRAY_COMPLEX) { + uint8_t *sarray = (uint8_t *)source->array; + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX); + mp_float_t *array = (mp_float_t *)ndarray->array; + uint8_t itemsize = sizeof(mp_float_t); + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t real = *(mp_float_t *)sarray; + mp_float_t imag = *(mp_float_t *)(sarray + 
itemsize); + mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag); + sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs); + mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real); + *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta); + *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta); + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif /* ULAB_MAX_DIMS > 3 */ + return MP_OBJ_FROM_PTR(ndarray); + } else if(source->dtype == NDARRAY_FLOAT) { + uint8_t *sarray = (uint8_t *)source->array; + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX); + mp_float_t *array = (mp_float_t *)ndarray->array; + + #if ULAB_MAX_DIMS > 3 + size_t i = 0; + do { + #endif + #if ULAB_MAX_DIMS > 2 + size_t j = 0; + do { + #endif + #if ULAB_MAX_DIMS > 1 + size_t k = 0; + do { + #endif + size_t l = 0; + do { + mp_float_t value = *(mp_float_t *)sarray; + if(value >= MICROPY_FLOAT_CONST(0.0)) { + *array++ = MICROPY_FLOAT_C_FUN(sqrt)(value); + array++; + } else { + array++; + *array++ = MICROPY_FLOAT_C_FUN(sqrt)(-value); + } + sarray += source->strides[ULAB_MAX_DIMS - 1]; + l++; + } while(l < source->shape[ULAB_MAX_DIMS - 1]); + #if ULAB_MAX_DIMS > 1 + sarray 
-= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; + sarray += source->strides[ULAB_MAX_DIMS - 2]; + k++; + } while(k < source->shape[ULAB_MAX_DIMS - 2]); + #endif /* ULAB_MAX_DIMS > 1 */ + #if ULAB_MAX_DIMS > 2 + sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2]; + sarray += source->strides[ULAB_MAX_DIMS - 3]; + j++; + } while(j < source->shape[ULAB_MAX_DIMS - 3]); + #endif /* ULAB_MAX_DIMS > 2 */ + #if ULAB_MAX_DIMS > 3 + sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3]; + sarray += source->strides[ULAB_MAX_DIMS - 4]; + i++; + } while(i < source->shape[ULAB_MAX_DIMS - 4]); + #endif /* ULAB_MAX_DIMS > 3 */ + return MP_OBJ_FROM_PTR(ndarray); + } else { + mp_raise_TypeError(translate("input dtype must be float or complex")); + } + } + } + return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(sqrt)); +} +MP_DEFINE_CONST_FUN_OBJ_KW(vector_sqrt_obj, 1, vector_sqrt); +#else +MATH_FUN_1(sqrt, sqrt); +MP_DEFINE_CONST_FUN_OBJ_1(vector_sqrt_obj, vector_sqrt); +#endif /* ULAB_SUPPORTS_COMPLEX */ + +#endif /* ULAB_NUMPY_HAS_SQRT */ + +#if ULAB_NUMPY_HAS_TAN +//| def tan(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the tangent""" +//| ... +//| + +MATH_FUN_1(tan, tan); +MP_DEFINE_CONST_FUN_OBJ_1(vector_tan_obj, vector_tan); +#endif + +#if ULAB_NUMPY_HAS_TANH +//| def tanh(a: _ArrayLike) -> ulab.numpy.ndarray: +//| """Computes the hyperbolic tangent""" +//| ... 
+ +MATH_FUN_1(tanh, tanh); +MP_DEFINE_CONST_FUN_OBJ_1(vector_tanh_obj, vector_tanh); +#endif + +#if ULAB_NUMPY_HAS_VECTORIZE +static mp_obj_t vector_vectorized_function_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) { + (void) n_args; + (void) n_kw; + vectorized_function_obj_t *self = MP_OBJ_TO_PTR(self_in); + mp_obj_t avalue[1]; + mp_obj_t fvalue; + if(mp_obj_is_type(args[0], &ulab_ndarray_type)) { + ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]); + COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype) + ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, self->otypes); + for(size_t i=0; i < source->len; i++) { + avalue[0] = mp_binary_get_val_array(source->dtype, source->array, i); + fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, avalue); + ndarray_set_value(self->otypes, ndarray->array, i, fvalue); + } + return MP_OBJ_FROM_PTR(ndarray); + } else if(mp_obj_is_type(args[0], &mp_type_tuple) || mp_obj_is_type(args[0], &mp_type_list) || + mp_obj_is_type(args[0], &mp_type_range)) { // i.e., the input is a generic iterable + size_t len = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0])); + ndarray_obj_t *ndarray = ndarray_new_linear_array(len, self->otypes); + mp_obj_iter_buf_t iter_buf; + mp_obj_t iterable = mp_getiter(args[0], &iter_buf); + size_t i=0; + while ((avalue[0] = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, avalue); + ndarray_set_value(self->otypes, ndarray->array, i, fvalue); + i++; + } + return MP_OBJ_FROM_PTR(ndarray); + } else if(mp_obj_is_int(args[0]) || mp_obj_is_float(args[0])) { + ndarray_obj_t *ndarray = ndarray_new_linear_array(1, self->otypes); + fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, args); + ndarray_set_value(self->otypes, ndarray->array, 0, fvalue); + return MP_OBJ_FROM_PTR(ndarray); + } else { + mp_raise_ValueError(translate("wrong input type")); + } + return mp_const_none; +} + +const mp_obj_type_t vector_function_type = { 
+ { &mp_type_type }, + .flags = MP_TYPE_FLAG_EXTENDED, + .name = MP_QSTR_, + MP_TYPE_EXTENDED_FIELDS( + .call = vector_vectorized_function_call, + ) +}; + +//| def vectorize( +//| f: Union[Callable[[int], _float], Callable[[_float], _float]], +//| *, +//| otypes: Optional[_DType] = None +//| ) -> Callable[[_ArrayLike], ulab.numpy.ndarray]: +//| """ +//| :param callable f: The function to wrap +//| :param otypes: List of array types that may be returned by the function. None is interpreted to mean the return value is float. +//| +//| Wrap a Python function ``f`` so that it can be applied to arrays. +//| The callable must return only values of the types specified by ``otypes``, or the result is undefined.""" +//| ... +//| + +static mp_obj_t vector_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + { MP_QSTR_otypes, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} } + }; + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + const mp_obj_type_t *type = mp_obj_get_type(args[0].u_obj); + if(mp_type_get_call_slot(type) == NULL) { + mp_raise_TypeError(translate("first argument must be a callable")); + } + mp_obj_t _otypes = args[1].u_obj; + uint8_t otypes = NDARRAY_FLOAT; + if(_otypes == mp_const_none) { + // TODO: is this what numpy does? 
+ otypes = NDARRAY_FLOAT; + } else if(mp_obj_is_int(_otypes)) { + otypes = mp_obj_get_int(_otypes); + if(otypes != NDARRAY_FLOAT && otypes != NDARRAY_UINT8 && otypes != NDARRAY_INT8 && + otypes != NDARRAY_UINT16 && otypes != NDARRAY_INT16) { + mp_raise_ValueError(translate("wrong output type")); + } + } + else { + mp_raise_ValueError(translate("wrong output type")); + } + vectorized_function_obj_t *function = m_new_obj(vectorized_function_obj_t); + function->base.type = &vector_function_type; + function->otypes = otypes; + function->fun = args[0].u_obj; + function->type = type; + return MP_OBJ_FROM_PTR(function); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(vector_vectorize_obj, 1, vector_vectorize); +#endif diff --git a/circuitpython/extmod/ulab/code/numpy/vector.h b/circuitpython/extmod/ulab/code/numpy/vector.h new file mode 100644 index 0000000..ea38b0f --- /dev/null +++ b/circuitpython/extmod/ulab/code/numpy/vector.h @@ -0,0 +1,161 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös +*/ + +#ifndef _VECTOR_ +#define _VECTOR_ + +#include "../ulab.h" +#include "../ndarray.h" + +MP_DECLARE_CONST_FUN_OBJ_1(vector_acos_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_acosh_obj); +MP_DECLARE_CONST_FUN_OBJ_2(vector_arctan2_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(vector_around_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_asin_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_asinh_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_atan_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_atanh_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_ceil_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_cos_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_cosh_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_degrees_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_erf_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_erfc_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_exp_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_expm1_obj); 
+MP_DECLARE_CONST_FUN_OBJ_1(vector_floor_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_gamma_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_lgamma_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_log_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_log10_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_log2_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_radians_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_sin_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_sinh_obj); +#if ULAB_SUPPORTS_COMPLEX +MP_DECLARE_CONST_FUN_OBJ_KW(vector_sqrt_obj); +#else +MP_DECLARE_CONST_FUN_OBJ_1(vector_sqrt_obj); +#endif +MP_DECLARE_CONST_FUN_OBJ_1(vector_tan_obj); +MP_DECLARE_CONST_FUN_OBJ_1(vector_tanh_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(vector_vectorize_obj); + +typedef struct _vectorized_function_obj_t { + mp_obj_base_t base; + uint8_t otypes; + mp_obj_t fun; + const mp_obj_type_t *type; +} vectorized_function_obj_t; + +#if ULAB_HAS_FUNCTION_ITERATOR +#define ITERATE_VECTOR(type, array, source, sarray, shift)\ +({\ + size_t *scoords = ndarray_new_coords((source)->ndim);\ + for(size_t i=0; i < (source)->len/(source)->shape[ULAB_MAX_DIMS -1]; i++) {\ + for(size_t l=0; l < (source)->shape[ULAB_MAX_DIMS - 1]; l++) {\ + *(array) = f(*((type *)(sarray)));\ + (array) += (shift);\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\ + }\ + ndarray_rewind_array((source)->ndim, sarray, (source)->shape, (source)->strides, scoords);\ + }\ +}) + +#else + +#if ULAB_MAX_DIMS == 4 +#define ITERATE_VECTOR(type, array, source, sarray) do {\ + size_t i=0;\ + do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = f(*((type *)(sarray)));\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * 
(source)->shape[ULAB_MAX_DIMS-2];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (source)->shape[ULAB_MAX_DIMS-3]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 3] * (source)->shape[ULAB_MAX_DIMS-3];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 4];\ + i++;\ + } while(i < (source)->shape[ULAB_MAX_DIMS-4]);\ +} while(0) +#endif /* ULAB_MAX_DIMS == 4 */ + +#if ULAB_MAX_DIMS == 3 +#define ITERATE_VECTOR(type, array, source, sarray) do {\ + size_t j = 0;\ + do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = f(*((type *)(sarray)));\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * (source)->shape[ULAB_MAX_DIMS-2];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 3];\ + j++;\ + } while(j < (source)->shape[ULAB_MAX_DIMS-3]);\ +} while(0) +#endif /* ULAB_MAX_DIMS == 3 */ + +#if ULAB_MAX_DIMS == 2 +#define ITERATE_VECTOR(type, array, source, sarray) do {\ + size_t k = 0;\ + do {\ + size_t l = 0;\ + do {\ + *(array)++ = f(*((type *)(sarray)));\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\ + (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\ + k++;\ + } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\ +} while(0) +#endif /* ULAB_MAX_DIMS == 2 */ + +#if ULAB_MAX_DIMS == 1 +#define ITERATE_VECTOR(type, array, source, sarray) do {\ + size_t l = 0;\ + do {\ + *(array)++ = f(*((type *)(sarray)));\ + (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\ + l++;\ + } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\ +} while(0) +#endif /* ULAB_MAX_DIMS == 1 */ +#endif /* 
ULAB_HAS_FUNCTION_ITERATOR */ + +#define MATH_FUN_1(py_name, c_name) \ + static mp_obj_t vector_ ## py_name(mp_obj_t x_obj) { \ + return vector_generic_vector(x_obj, MICROPY_FLOAT_C_FUN(c_name)); \ +} + +#endif /* _VECTOR_ */ diff --git a/circuitpython/extmod/ulab/code/scipy/linalg/linalg.c b/circuitpython/extmod/ulab/code/scipy/linalg/linalg.c new file mode 100644 index 0000000..d211f72 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/linalg/linalg.c @@ -0,0 +1,280 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2021 Vikas Udupa + * +*/ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../../ulab.h" +#include "../../ulab_tools.h" +#include "../../numpy/linalg/linalg_tools.h" +#include "linalg.h" + +#if ULAB_SCIPY_HAS_LINALG_MODULE +//| +//| import ulab.scipy +//| import ulab.numpy +//| +//| """Linear algebra functions""" +//| + +#if ULAB_MAX_DIMS > 1 + +//| def solve_triangular(A: ulab.numpy.ndarray, b: ulab.numpy.ndarray, lower: bool) -> ulab.numpy.ndarray: +//| """ +//| :param ~ulab.numpy.ndarray A: a matrix +//| :param ~ulab.numpy.ndarray b: a vector +//| :param ~bool lower: if true, use only data contained in lower triangle of A, else use upper triangle of A +//| :return: solution to the system A x = b. Shape of return matches b +//| :raises TypeError: if A and b are not of type ndarray and are not dense +//| :raises ValueError: if A is a singular matrix +//| +//| Solve the equation A x = b for x, assuming A is a triangular matrix""" +//| ... 
+//| + +static mp_obj_t solve_triangular(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + + size_t i, j; + + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none} } , + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none} } , + { MP_QSTR_lower, MP_ARG_OBJ, { .u_rom_obj = mp_const_false } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type) || !mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("first two arguments must be ndarrays")); + } + + ndarray_obj_t *A = MP_OBJ_TO_PTR(args[0].u_obj); + ndarray_obj_t *b = MP_OBJ_TO_PTR(args[1].u_obj); + + if(!ndarray_is_dense(A) || !ndarray_is_dense(b)) { + mp_raise_TypeError(translate("input must be a dense ndarray")); + } + + size_t A_rows = A->shape[ULAB_MAX_DIMS - 2]; + size_t A_cols = A->shape[ULAB_MAX_DIMS - 1]; + + uint8_t *A_arr = (uint8_t *)A->array; + uint8_t *b_arr = (uint8_t *)b->array; + + mp_float_t (*get_A_ele)(void *) = ndarray_get_float_function(A->dtype); + mp_float_t (*get_b_ele)(void *) = ndarray_get_float_function(b->dtype); + + uint8_t *temp_A = A_arr; + + // check if input matrix A is singular + for (i = 0; i < A_rows; i++) { + if (MICROPY_FLOAT_C_FUN(fabs)(get_A_ele(A_arr)) < LINALG_EPSILON) + mp_raise_ValueError(translate("input matrix is singular")); + A_arr += A->strides[ULAB_MAX_DIMS - 2]; + A_arr += A->strides[ULAB_MAX_DIMS - 1]; + } + + A_arr = temp_A; + + ndarray_obj_t *x = ndarray_new_dense_ndarray(b->ndim, b->shape, NDARRAY_FLOAT); + mp_float_t *x_arr = (mp_float_t *)x->array; + + if (mp_obj_is_true(args[2].u_obj)) { + // Solve the lower triangular matrix by iterating each row of A. + // Start by finding the first unknown using the first row. 
+ // On finding this unknown, find the second unknown using the second row. + // Continue the same till the last unknown is found using the last row. + + for (i = 0; i < A_rows; i++) { + mp_float_t sum = 0.0; + for (j = 0; j < i; j++) { + sum += (get_A_ele(A_arr) * (*x_arr++)); + A_arr += A->strides[ULAB_MAX_DIMS - 1]; + } + + sum = (get_b_ele(b_arr) - sum) / (get_A_ele(A_arr)); + *x_arr = sum; + + x_arr -= j; + A_arr -= A->strides[ULAB_MAX_DIMS - 1] * j; + A_arr += A->strides[ULAB_MAX_DIMS - 2]; + b_arr += b->strides[ULAB_MAX_DIMS - 1]; + } + } else { + // Solve the upper triangular matrix by iterating each row of A. + // Start by finding the last unknown using the last row. + // On finding this unknown, find the last-but-one unknown using the last-but-one row. + // Continue the same till the first unknown is found using the first row. + + A_arr += (A->strides[ULAB_MAX_DIMS - 2] * A_rows); + b_arr += (b->strides[ULAB_MAX_DIMS - 1] * A_cols); + x_arr += A_cols; + + for (i = A_rows - 1; i < A_rows; i--) { + mp_float_t sum = 0.0; + for (j = i + 1; j < A_cols; j++) { + sum += (get_A_ele(A_arr) * (*x_arr++)); + A_arr += A->strides[ULAB_MAX_DIMS - 1]; + } + + x_arr -= (j - i); + A_arr -= (A->strides[ULAB_MAX_DIMS - 1] * (j - i)); + b_arr -= b->strides[ULAB_MAX_DIMS - 1]; + + sum = (get_b_ele(b_arr) - sum) / get_A_ele(A_arr); + *x_arr = sum; + + A_arr -= A->strides[ULAB_MAX_DIMS - 2]; + } + } + + return MP_OBJ_FROM_PTR(x); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(linalg_solve_triangular_obj, 2, solve_triangular); + +//| def cho_solve(L: ulab.numpy.ndarray, b: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| :param ~ulab.numpy.ndarray L: the lower triangular, Cholesky factorization of A +//| :param ~ulab.numpy.ndarray b: right-hand-side vector b +//| :return: solution to the system A x = b. 
Shape of return matches b +//| :raises TypeError: if L and b are not of type ndarray and are not dense +//| +//| Solve the linear equations A x = b, given the Cholesky factorization of A as input""" +//| ... +//| + +static mp_obj_t cho_solve(mp_obj_t _L, mp_obj_t _b) { + + if(!mp_obj_is_type(_L, &ulab_ndarray_type) || !mp_obj_is_type(_b, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("first two arguments must be ndarrays")); + } + + ndarray_obj_t *L = MP_OBJ_TO_PTR(_L); + ndarray_obj_t *b = MP_OBJ_TO_PTR(_b); + + if(!ndarray_is_dense(L) || !ndarray_is_dense(b)) { + mp_raise_TypeError(translate("input must be a dense ndarray")); + } + + mp_float_t (*get_L_ele)(void *) = ndarray_get_float_function(L->dtype); + mp_float_t (*get_b_ele)(void *) = ndarray_get_float_function(b->dtype); + void (*set_L_ele)(void *, mp_float_t) = ndarray_set_float_function(L->dtype); + + size_t L_rows = L->shape[ULAB_MAX_DIMS - 2]; + size_t L_cols = L->shape[ULAB_MAX_DIMS - 1]; + + // Obtain transpose of the input matrix L in L_t + size_t L_t_shape[ULAB_MAX_DIMS]; + size_t L_t_rows = L_t_shape[ULAB_MAX_DIMS - 2] = L_cols; + size_t L_t_cols = L_t_shape[ULAB_MAX_DIMS - 1] = L_rows; + ndarray_obj_t *L_t = ndarray_new_dense_ndarray(L->ndim, L_t_shape, L->dtype); + + uint8_t *L_arr = (uint8_t *)L->array; + uint8_t *L_t_arr = (uint8_t *)L_t->array; + uint8_t *b_arr = (uint8_t *)b->array; + + size_t i, j; + + uint8_t *L_ptr = L_arr; + uint8_t *L_t_ptr = L_t_arr; + for (i = 0; i < L_rows; i++) { + for (j = 0; j < L_cols; j++) { + set_L_ele(L_t_ptr, get_L_ele(L_ptr)); + L_t_ptr += L_t->strides[ULAB_MAX_DIMS - 2]; + L_ptr += L->strides[ULAB_MAX_DIMS - 1]; + } + + L_t_ptr -= j * L_t->strides[ULAB_MAX_DIMS - 2]; + L_t_ptr += L_t->strides[ULAB_MAX_DIMS - 1]; + L_ptr -= j * L->strides[ULAB_MAX_DIMS - 1]; + L_ptr += L->strides[ULAB_MAX_DIMS - 2]; + } + + ndarray_obj_t *x = ndarray_new_dense_ndarray(b->ndim, b->shape, NDARRAY_FLOAT); + mp_float_t *x_arr = (mp_float_t *)x->array; + + ndarray_obj_t *y 
= ndarray_new_dense_ndarray(b->ndim, b->shape, NDARRAY_FLOAT); + mp_float_t *y_arr = (mp_float_t *)y->array; + + // solve L y = b to obtain y, where L_t x = y + for (i = 0; i < L_rows; i++) { + mp_float_t sum = 0.0; + for (j = 0; j < i; j++) { + sum += (get_L_ele(L_arr) * (*y_arr++)); + L_arr += L->strides[ULAB_MAX_DIMS - 1]; + } + + sum = (get_b_ele(b_arr) - sum) / (get_L_ele(L_arr)); + *y_arr = sum; + + y_arr -= j; + L_arr -= L->strides[ULAB_MAX_DIMS - 1] * j; + L_arr += L->strides[ULAB_MAX_DIMS - 2]; + b_arr += b->strides[ULAB_MAX_DIMS - 1]; + } + + // using y, solve L_t x = y to obtain x + L_t_arr += (L_t->strides[ULAB_MAX_DIMS - 2] * L_t_rows); + y_arr += L_t_cols; + x_arr += L_t_cols; + + for (i = L_t_rows - 1; i < L_t_rows; i--) { + mp_float_t sum = 0.0; + for (j = i + 1; j < L_t_cols; j++) { + sum += (get_L_ele(L_t_arr) * (*x_arr++)); + L_t_arr += L_t->strides[ULAB_MAX_DIMS - 1]; + } + + x_arr -= (j - i); + L_t_arr -= (L_t->strides[ULAB_MAX_DIMS - 1] * (j - i)); + y_arr--; + + sum = ((*y_arr) - sum) / get_L_ele(L_t_arr); + *x_arr = sum; + + L_t_arr -= L_t->strides[ULAB_MAX_DIMS - 2]; + } + + return MP_OBJ_FROM_PTR(x); +} + +MP_DEFINE_CONST_FUN_OBJ_2(linalg_cho_solve_obj, cho_solve); + +#endif + +static const mp_rom_map_elem_t ulab_scipy_linalg_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_linalg) }, + #if ULAB_MAX_DIMS > 1 + #if ULAB_SCIPY_LINALG_HAS_SOLVE_TRIANGULAR + { MP_ROM_QSTR(MP_QSTR_solve_triangular), (mp_obj_t)&linalg_solve_triangular_obj }, + #endif + #if ULAB_SCIPY_LINALG_HAS_CHO_SOLVE + { MP_ROM_QSTR(MP_QSTR_cho_solve), (mp_obj_t)&linalg_cho_solve_obj }, + #endif + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_scipy_linalg_globals, ulab_scipy_linalg_globals_table); + +const mp_obj_module_t ulab_scipy_linalg_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_linalg_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_scipy_dot_linalg, ulab_scipy_linalg_module, 
MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); + +#endif diff --git a/circuitpython/extmod/ulab/code/scipy/linalg/linalg.h b/circuitpython/extmod/ulab/code/scipy/linalg/linalg.h new file mode 100644 index 0000000..628051f --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/linalg/linalg.h @@ -0,0 +1,21 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2021 Vikas Udupa + * +*/ + +#ifndef _SCIPY_LINALG_ +#define _SCIPY_LINALG_ + +extern const mp_obj_module_t ulab_scipy_linalg_module; + +MP_DECLARE_CONST_FUN_OBJ_KW(linalg_solve_triangular_obj); +MP_DECLARE_CONST_FUN_OBJ_2(linalg_cho_solve_obj); + +#endif /* _SCIPY_LINALG_ */ diff --git a/circuitpython/extmod/ulab/code/scipy/optimize/optimize.c b/circuitpython/extmod/ulab/code/scipy/optimize/optimize.c new file mode 100644 index 0000000..f1c746a --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/optimize/optimize.c @@ -0,0 +1,415 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" + +#include "../../ndarray.h" +#include "../../ulab.h" +#include "../../ulab_tools.h" +#include "optimize.h" + +const mp_obj_float_t xtolerance = {{&mp_type_float}, MICROPY_FLOAT_CONST(2.4e-7)}; +const mp_obj_float_t rtolerance = {{&mp_type_float}, MICROPY_FLOAT_CONST(0.0)}; + +static mp_float_t optimize_python_call(const mp_obj_type_t *type, mp_obj_t fun, mp_float_t x, mp_obj_t *fargs, uint8_t nparams) { + // Helper function for calculating the value of f(x, a, b, c, ...), + // where f is defined in python. Takes a float, returns a float. 
+ // The array of mp_obj_t type must be supplied, as must the number of parameters (a, b, c...) in nparams + fargs[0] = mp_obj_new_float(x); + return mp_obj_get_float(type->MP_TYPE_CALL(fun, nparams+1, 0, fargs)); +} + +#if ULAB_SCIPY_OPTIMIZE_HAS_BISECT +//| def bisect( +//| fun: Callable[[float], float], +//| a: float, +//| b: float, +//| *, +//| xtol: float = 2.4e-7, +//| maxiter: int = 100 +//| ) -> float: +//| """ +//| :param callable f: The function to bisect +//| :param float a: The left side of the interval +//| :param float b: The right side of the interval +//| :param float xtol: The tolerance value +//| :param float maxiter: The maximum number of iterations to perform +//| +//| Find a solution (zero) of the function ``f(x)`` on the interval +//| (``a``..``b``) using the bisection method. The result is accurate to within +//| ``xtol`` unless more than ``maxiter`` steps are required.""" +//| ... +//| + +STATIC mp_obj_t optimize_bisect(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // Simple bisection routine + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_xtol, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&xtolerance)} }, + { MP_QSTR_maxiter, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 100} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t fun = args[0].u_obj; + const mp_obj_type_t *type = mp_obj_get_type(fun); + if(mp_type_get_call_slot(type) == NULL) { + mp_raise_TypeError(translate("first argument must be a function")); + } + mp_float_t xtol = mp_obj_get_float(args[3].u_obj); + mp_obj_t *fargs = m_new(mp_obj_t, 1); + mp_float_t left, right; + mp_float_t x_mid; + mp_float_t a = 
mp_obj_get_float(args[1].u_obj); + mp_float_t b = mp_obj_get_float(args[2].u_obj); + left = optimize_python_call(type, fun, a, fargs, 0); + right = optimize_python_call(type, fun, b, fargs, 0); + if(left * right > 0) { + mp_raise_ValueError(translate("function has the same sign at the ends of interval")); + } + mp_float_t rtb = left < MICROPY_FLOAT_CONST(0.0) ? a : b; + mp_float_t dx = left < MICROPY_FLOAT_CONST(0.0) ? b - a : a - b; + if(args[4].u_int < 0) { + mp_raise_ValueError(translate("maxiter should be > 0")); + } + for(uint16_t i=0; i < args[4].u_int; i++) { + dx *= MICROPY_FLOAT_CONST(0.5); + x_mid = rtb + dx; + if(optimize_python_call(type, fun, x_mid, fargs, 0) < MICROPY_FLOAT_CONST(0.0)) { + rtb = x_mid; + } + if(MICROPY_FLOAT_C_FUN(fabs)(dx) < xtol) break; + } + return mp_obj_new_float(rtb); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(optimize_bisect_obj, 3, optimize_bisect); +#endif + +#if ULAB_SCIPY_OPTIMIZE_HAS_FMIN +//| def fmin( +//| fun: Callable[[float], float], +//| x0: float, +//| *, +//| xatol: float = 2.4e-7, +//| fatol: float = 2.4e-7, +//| maxiter: int = 200 +//| ) -> float: +//| """ +//| :param callable f: The function to bisect +//| :param float x0: The initial x value +//| :param float xatol: The absolute tolerance value +//| :param float fatol: The relative tolerance value +//| +//| Find a minimum of the function ``f(x)`` using the downhill simplex method. +//| The located ``x`` is within ``fxtol`` of the actual minimum, and ``f(x)`` +//| is within ``fatol`` of the actual minimum unless more than ``maxiter`` +//| steps are requried.""" +//| ... 
+//| + +STATIC mp_obj_t optimize_fmin(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // downhill simplex method in 1D + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_xatol, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&xtolerance)} }, + { MP_QSTR_fatol, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&xtolerance)} }, + { MP_QSTR_maxiter, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 200} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t fun = args[0].u_obj; + const mp_obj_type_t *type = mp_obj_get_type(fun); + if(mp_type_get_call_slot(type) == NULL) { + mp_raise_TypeError(translate("first argument must be a function")); + } + + // parameters controlling convergence conditions + mp_float_t xatol = mp_obj_get_float(args[2].u_obj); + mp_float_t fatol = mp_obj_get_float(args[3].u_obj); + if(args[4].u_int <= 0) { + mp_raise_ValueError(translate("maxiter must be > 0")); + } + uint16_t maxiter = (uint16_t)args[4].u_int; + + mp_float_t x0 = mp_obj_get_float(args[1].u_obj); + mp_float_t x1 = MICROPY_FLOAT_C_FUN(fabs)(x0) > OPTIMIZE_EPSILON ? 
(MICROPY_FLOAT_CONST(1.0) + OPTIMIZE_NONZDELTA) * x0 : OPTIMIZE_ZDELTA; + mp_obj_t *fargs = m_new(mp_obj_t, 1); + mp_float_t f0 = optimize_python_call(type, fun, x0, fargs, 0); + mp_float_t f1 = optimize_python_call(type, fun, x1, fargs, 0); + if(f1 < f0) { + SWAP(mp_float_t, x0, x1); + SWAP(mp_float_t, f0, f1); + } + for(uint16_t i=0; i < maxiter; i++) { + uint8_t shrink = 0; + f0 = optimize_python_call(type, fun, x0, fargs, 0); + f1 = optimize_python_call(type, fun, x1, fargs, 0); + + // reflection + mp_float_t xr = (MICROPY_FLOAT_CONST(1.0) + OPTIMIZE_ALPHA) * x0 - OPTIMIZE_ALPHA * x1; + mp_float_t fr = optimize_python_call(type, fun, xr, fargs, 0); + if(fr < f0) { // expansion + mp_float_t xe = (1 + OPTIMIZE_ALPHA * OPTIMIZE_BETA) * x0 - OPTIMIZE_ALPHA * OPTIMIZE_BETA * x1; + mp_float_t fe = optimize_python_call(type, fun, xe, fargs, 0); + if(fe < fr) { + x1 = xe; + f1 = fe; + } else { + x1 = xr; + f1 = fr; + } + } else { + if(fr < f1) { // contraction + mp_float_t xc = (1 + OPTIMIZE_GAMMA * OPTIMIZE_ALPHA) * x0 - OPTIMIZE_GAMMA * OPTIMIZE_ALPHA * x1; + mp_float_t fc = optimize_python_call(type, fun, xc, fargs, 0); + if(fc < fr) { + x1 = xc; + f1 = fc; + } else { + shrink = 1; + } + } else { // inside contraction + mp_float_t xc = (MICROPY_FLOAT_CONST(1.0) - OPTIMIZE_GAMMA) * x0 + OPTIMIZE_GAMMA * x1; + mp_float_t fc = optimize_python_call(type, fun, xc, fargs, 0); + if(fc < f1) { + x1 = xc; + f1 = fc; + } else { + shrink = 1; + } + } + if(shrink == 1) { + x1 = x0 + OPTIMIZE_DELTA * (x1 - x0); + f1 = optimize_python_call(type, fun, x1, fargs, 0); + } + if((MICROPY_FLOAT_C_FUN(fabs)(f1 - f0) < fatol) || + (MICROPY_FLOAT_C_FUN(fabs)(x1 - x0) < xatol)) { + break; + } + if(f1 < f0) { + SWAP(mp_float_t, x0, x1); + SWAP(mp_float_t, f0, f1); + } + } + } + return mp_obj_new_float(x0); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(optimize_fmin_obj, 2, optimize_fmin); +#endif + +#if ULAB_SCIPY_OPTIMIZE_HAS_CURVE_FIT +static void optimize_jacobi(const mp_obj_type_t *type, mp_obj_t 
fun, mp_float_t *x, mp_float_t *y, uint16_t len, mp_float_t *params, uint8_t nparams, mp_float_t *jacobi, mp_float_t *grad) { + /* Calculates the Jacobian and the gradient of the cost function + * + * The entries in the Jacobian are + * J(m, n) = de_m/da_n, + * + * where + * + * e_m = (f(x_m, a1, a2, ...) - y_m)/sigma_m is the error at x_m, + * + * and + * + * a1, a2, ..., a_n are the free parameters + */ + mp_obj_t *fargs0 = m_new(mp_obj_t, lenp+1); + mp_obj_t *fargs1 = m_new(mp_obj_t, lenp+1); + for(uint8_t p=0; p < nparams; p++) { + fargs0[p+1] = mp_obj_new_float(params[p]); + fargs1[p+1] = mp_obj_new_float(params[p]); + } + for(uint8_t p=0; p < nparams; p++) { + mp_float_t da = params[p] != MICROPY_FLOAT_CONST(0.0) ? (MICROPY_FLOAT_CONST(1.0) + APPROX_NONZDELTA) * params[p] : APPROX_ZDELTA; + fargs1[p+1] = mp_obj_new_float(params[p] + da); + grad[p] = MICROPY_FLOAT_CONST(0.0); + for(uint16_t i=0; i < len; i++) { + mp_float_t f0 = optimize_python_call(type, fun, x[i], fargs0, nparams); + mp_float_t f1 = optimize_python_call(type, fun, x[i], fargs1, nparams); + jacobi[i*nparamp+p] = (f1 - f0) / da; + grad[p] += (f0 - y[i]) * jacobi[i*nparamp+p]; + } + fargs1[p+1] = fargs0[p+1]; // set back to the original value + } +} + +static void optimize_delta(mp_float_t *jacobi, mp_float_t *grad, uint16_t len, uint8_t nparams, mp_float_t lambda) { + // +} + +mp_obj_t optimize_curve_fit(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // Levenberg-Marquardt non-linear fit + // The implementation follows the introductory discussion in Mark Tanstrum's paper, https://arxiv.org/abs/1201.5885 + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_p0, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_xatol, 
MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&xtolerance)} }, + { MP_QSTR_fatol, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&xtolerance)} }, + { MP_QSTR_maxiter, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t fun = args[0].u_obj; + const mp_obj_type_t *type = mp_obj_get_type(fun); + if(mp_type_get_call_slot(type) == NULL) { + mp_raise_TypeError(translate("first argument must be a function")); + } + + mp_obj_t x_obj = args[1].u_obj; + mp_obj_t y_obj = args[2].u_obj; + mp_obj_t p0_obj = args[3].u_obj; + if(!ndarray_object_is_array_like(x_obj) || !ndarray_object_is_array_like(y_obj)) { + mp_raise_TypeError(translate("data must be iterable")); + } + if(!ndarray_object_is_nditerable(p0_obj)) { + mp_raise_TypeError(translate("initial values must be iterable")); + } + size_t len = (size_t)mp_obj_get_int(mp_obj_len_maybe(x_obj)); + uint8_t lenp = (uint8_t)mp_obj_get_int(mp_obj_len_maybe(p0_obj)); + if(len != (uint16_t)mp_obj_get_int(mp_obj_len_maybe(y_obj))) { + mp_raise_ValueError(translate("data must be of equal length")); + } + + mp_float_t *x = m_new(mp_float_t, len); + fill_array_iterable(x, x_obj); + mp_float_t *y = m_new(mp_float_t, len); + fill_array_iterable(y, y_obj); + mp_float_t *p0 = m_new(mp_float_t, lenp); + fill_array_iterable(p0, p0_obj); + mp_float_t *grad = m_new(mp_float_t, len); + mp_float_t *jacobi = m_new(mp_float_t, len*len); + mp_obj_t *fargs = m_new(mp_obj_t, lenp+1); + + m_del(mp_float_t, p0, lenp); + // parameters controlling convergence conditions + //mp_float_t xatol = mp_obj_get_float(args[2].u_obj); + //mp_float_t fatol = mp_obj_get_float(args[3].u_obj); + + // this has finite binary representation; we will multiply/divide by 4 + //mp_float_t lambda = 0.0078125; + + //linalg_invert_matrix(mp_float_t *data, size_t N) + + m_del(mp_float_t, x, 
len); + m_del(mp_float_t, y, len); + m_del(mp_float_t, grad, len); + m_del(mp_float_t, jacobi, len*len); + m_del(mp_obj_t, fargs, lenp+1); + return mp_const_none; +} + +MP_DEFINE_CONST_FUN_OBJ_KW(optimize_curve_fit_obj, 2, optimize_curve_fit); +#endif + +#if ULAB_SCIPY_OPTIMIZE_HAS_NEWTON +//| def newton( +//| fun: Callable[[float], float], +//| x0: float, +//| *, +//| xtol: float = 2.4e-7, +//| rtol: float = 0.0, +//| maxiter: int = 50 +//| ) -> float: +//| """ +//| :param callable f: The function to bisect +//| :param float x0: The initial x value +//| :param float xtol: The absolute tolerance value +//| :param float rtol: The relative tolerance value +//| :param float maxiter: The maximum number of iterations to perform +//| +//| Find a solution (zero) of the function ``f(x)`` using Newton's Method. +//| The result is accurate to within ``xtol * rtol * |f(x)|`` unless more than +//| ``maxiter`` steps are requried.""" +//| ... +//| + +static mp_obj_t optimize_newton(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + // this is actually the secant method, as the first derivative of the function + // is not accepted as an argument. 
The function whose root we want to solve for + // must depend on a single variable without parameters, i.e., f(x) + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_tol, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_PTR(&xtolerance) } }, + { MP_QSTR_rtol, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_PTR(&rtolerance) } }, + { MP_QSTR_maxiter, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 50 } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + mp_obj_t fun = args[0].u_obj; + const mp_obj_type_t *type = mp_obj_get_type(fun); + if(mp_type_get_call_slot(type) == NULL) { + mp_raise_TypeError(translate("first argument must be a function")); + } + mp_float_t x = mp_obj_get_float(args[1].u_obj); + mp_float_t tol = mp_obj_get_float(args[2].u_obj); + mp_float_t rtol = mp_obj_get_float(args[3].u_obj); + mp_float_t dx, df, fx; + dx = x > MICROPY_FLOAT_CONST(0.0) ? 
OPTIMIZE_EPS * x : -OPTIMIZE_EPS * x; + mp_obj_t *fargs = m_new(mp_obj_t, 1); + if(args[4].u_int <= 0) { + mp_raise_ValueError(translate("maxiter must be > 0")); + } + for(uint16_t i=0; i < args[4].u_int; i++) { + fx = optimize_python_call(type, fun, x, fargs, 0); + df = (optimize_python_call(type, fun, x + dx, fargs, 0) - fx) / dx; + dx = fx / df; + x -= dx; + if(MICROPY_FLOAT_C_FUN(fabs)(dx) < (tol + rtol * MICROPY_FLOAT_C_FUN(fabs)(x))) break; + } + return mp_obj_new_float(x); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(optimize_newton_obj, 2, optimize_newton); +#endif + +static const mp_rom_map_elem_t ulab_scipy_optimize_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_optimize) }, + #if ULAB_SCIPY_OPTIMIZE_HAS_BISECT + { MP_OBJ_NEW_QSTR(MP_QSTR_bisect), (mp_obj_t)&optimize_bisect_obj }, + #endif + #if ULAB_SCIPY_OPTIMIZE_HAS_CURVE_FIT + { MP_OBJ_NEW_QSTR(MP_QSTR_curve_fit), (mp_obj_t)&optimize_curve_fit_obj }, + #endif + #if ULAB_SCIPY_OPTIMIZE_HAS_FMIN + { MP_OBJ_NEW_QSTR(MP_QSTR_fmin), (mp_obj_t)&optimize_fmin_obj }, + #endif + #if ULAB_SCIPY_OPTIMIZE_HAS_NEWTON + { MP_OBJ_NEW_QSTR(MP_QSTR_newton), (mp_obj_t)&optimize_newton_obj }, + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_scipy_optimize_globals, ulab_scipy_optimize_globals_table); + +const mp_obj_module_t ulab_scipy_optimize_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_optimize_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_scipy_dot_optimize, ulab_scipy_optimize_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); diff --git a/circuitpython/extmod/ulab/code/scipy/optimize/optimize.h b/circuitpython/extmod/ulab/code/scipy/optimize/optimize.h new file mode 100644 index 0000000..174b386 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/optimize/optimize.h @@ -0,0 +1,41 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * 
Copyright (c) 2020-2021 Zoltán Vörös + * +*/ + +#ifndef _SCIPY_OPTIMIZE_ +#define _SCIPY_OPTIMIZE_ + +#include "../../ulab_tools.h" + +#ifndef OPTIMIZE_EPSILON +#if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT +#define OPTIMIZE_EPSILON MICROPY_FLOAT_CONST(1.2e-7) +#elif MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_DOUBLE +#define OPTIMIZE_EPSILON MICROPY_FLOAT_CONST(2.3e-16) +#endif +#endif + +#define OPTIMIZE_EPS MICROPY_FLOAT_CONST(1.0e-4) +#define OPTIMIZE_NONZDELTA MICROPY_FLOAT_CONST(0.05) +#define OPTIMIZE_ZDELTA MICROPY_FLOAT_CONST(0.00025) +#define OPTIMIZE_ALPHA MICROPY_FLOAT_CONST(1.0) +#define OPTIMIZE_BETA MICROPY_FLOAT_CONST(2.0) +#define OPTIMIZE_GAMMA MICROPY_FLOAT_CONST(0.5) +#define OPTIMIZE_DELTA MICROPY_FLOAT_CONST(0.5) + +extern const mp_obj_module_t ulab_scipy_optimize_module; + +MP_DECLARE_CONST_FUN_OBJ_KW(optimize_bisect_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(optimize_curve_fit_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(optimize_fmin_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(optimize_newton_obj); + +#endif /* _SCIPY_OPTIMIZE_ */ diff --git a/circuitpython/extmod/ulab/code/scipy/scipy.c b/circuitpython/extmod/ulab/code/scipy/scipy.c new file mode 100644 index 0000000..ba37dde --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/scipy.c @@ -0,0 +1,52 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include "py/runtime.h" + +#include "../ulab.h" +#include "optimize/optimize.h" +#include "signal/signal.h" +#include "special/special.h" +#include "linalg/linalg.h" + +#if ULAB_HAS_SCIPY + +//| """Compatibility layer for scipy""" +//| + +static const mp_rom_map_elem_t ulab_scipy_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_scipy) }, + #if 
ULAB_SCIPY_HAS_LINALG_MODULE + { MP_ROM_QSTR(MP_QSTR_linalg), MP_ROM_PTR(&ulab_scipy_linalg_module) }, + #endif + #if ULAB_SCIPY_HAS_OPTIMIZE_MODULE + { MP_ROM_QSTR(MP_QSTR_optimize), MP_ROM_PTR(&ulab_scipy_optimize_module) }, + #endif + #if ULAB_SCIPY_HAS_SIGNAL_MODULE + { MP_ROM_QSTR(MP_QSTR_signal), MP_ROM_PTR(&ulab_scipy_signal_module) }, + #endif + #if ULAB_SCIPY_HAS_SPECIAL_MODULE + { MP_ROM_QSTR(MP_QSTR_special), MP_ROM_PTR(&ulab_scipy_special_module) }, + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_scipy_globals, ulab_scipy_globals_table); + +const mp_obj_module_t ulab_scipy_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_scipy, ulab_scipy_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); +#endif diff --git a/circuitpython/extmod/ulab/code/scipy/scipy.h b/circuitpython/extmod/ulab/code/scipy/scipy.h new file mode 100644 index 0000000..ec8c804 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/scipy.h @@ -0,0 +1,21 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös + * +*/ + +#ifndef _SCIPY_ +#define _SCIPY_ + +#include "../ulab.h" +#include "../ndarray.h" + +extern const mp_obj_module_t ulab_scipy_module; + +#endif /* _SCIPY_ */ diff --git a/circuitpython/extmod/ulab/code/scipy/signal/signal.c b/circuitpython/extmod/ulab/code/scipy/signal/signal.c new file mode 100644 index 0000000..69d5609 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/signal/signal.c @@ -0,0 +1,172 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include 
<string.h> +#include "py/runtime.h" + +#include "../../ulab.h" +#include "../../ndarray.h" +#include "../../numpy/carray/carray_tools.h" +#include "../../numpy/fft/fft_tools.h" + +#if ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM +//| import ulab.numpy +//| +//| def spectrogram(r: ulab.numpy.ndarray) -> ulab.numpy.ndarray: +//| """ +//| :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2 +//| +//| Computes the spectrum of the input signal. This is the absolute value of the (complex-valued) fft of the signal. +//| This function is similar to scipy's ``scipy.signal.spectrogram``.""" +//| ... +//| + +mp_obj_t signal_spectrogram(size_t n_args, const mp_obj_t *args) { + #if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE + return fft_fft_ifft_spectrogram(args[0], FFT_SPECTROGRAM); + #else + if(n_args == 2) { + return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_SPECTROGRAM); + } else { + return fft_fft_ifft_spectrogram(n_args, args[0], mp_const_none, FFT_SPECTROGRAM); + } + #endif +} + +#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(signal_spectrogram_obj, 1, 1, signal_spectrogram); +#else +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(signal_spectrogram_obj, 1, 2, signal_spectrogram); +#endif + +#endif /* ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM */ + +#if ULAB_SCIPY_SIGNAL_HAS_SOSFILT +static void signal_sosfilt_array(mp_float_t *x, const mp_float_t *coeffs, mp_float_t *zf, const size_t len) { + for(size_t i=0; i < len; i++) { + mp_float_t xn = *x; + *x = coeffs[0] * xn + zf[0]; + zf[0] = zf[1] + coeffs[1] * xn - coeffs[4] * *x; + zf[1] = coeffs[2] * xn - coeffs[5] * *x; + x++; + } + x -= len; +} + +mp_obj_t signal_sosfilt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_sos, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_x, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + { MP_QSTR_zi, 
MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + if(!ndarray_object_is_array_like(args[0].u_obj) || !ndarray_object_is_array_like(args[1].u_obj)) { + mp_raise_TypeError(translate("sosfilt requires iterable arguments")); + } + #if ULAB_SUPPORTS_COMPLEX + if(mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) { + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj); + COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype) + } + #endif + size_t lenx = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[1].u_obj)); + ndarray_obj_t *y = ndarray_new_linear_array(lenx, NDARRAY_FLOAT); + mp_float_t *yarray = (mp_float_t *)y->array; + mp_float_t coeffs[6]; + if(mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) { + ndarray_obj_t *inarray = MP_OBJ_TO_PTR(args[1].u_obj); + #if ULAB_MAX_DIMS > 1 + if(inarray->ndim > 1) { + mp_raise_ValueError(translate("input must be one-dimensional")); + } + #endif + uint8_t *iarray = (uint8_t *)inarray->array; + for(size_t i=0; i < lenx; i++) { + *yarray++ = ndarray_get_float_value(iarray, inarray->dtype); + iarray += inarray->strides[ULAB_MAX_DIMS - 1]; + } + yarray -= lenx; + } else { + fill_array_iterable(yarray, args[1].u_obj); + } + + mp_obj_iter_buf_t iter_buf; + mp_obj_t item, iterable = mp_getiter(args[0].u_obj, &iter_buf); + size_t lensos = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0].u_obj)); + + size_t *shape = ndarray_shape_vector(0, 0, lensos, 2); + ndarray_obj_t *zf = ndarray_new_dense_ndarray(2, shape, NDARRAY_FLOAT); + mp_float_t *zf_array = (mp_float_t *)zf->array; + + if(args[2].u_obj != mp_const_none) { + if(!mp_obj_is_type(args[2].u_obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("zi must be an ndarray")); + } else { + ndarray_obj_t *zi = MP_OBJ_TO_PTR(args[2].u_obj); + if((zi->shape[ULAB_MAX_DIMS - 1] != lensos) || (zi->shape[ULAB_MAX_DIMS - 1] != 2)) { + 
mp_raise_ValueError(translate("zi must be of shape (n_section, 2)")); + } + if(zi->dtype != NDARRAY_FLOAT) { + mp_raise_ValueError(translate("zi must be of float type")); + } + // TODO: this won't work with sparse arrays + memcpy(zf_array, zi->array, 2*lensos*sizeof(mp_float_t)); + } + } + while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { + if(mp_obj_get_int(mp_obj_len_maybe(item)) != 6) { + mp_raise_ValueError(translate("sos array must be of shape (n_section, 6)")); + } else { + fill_array_iterable(coeffs, item); + if(coeffs[3] != MICROPY_FLOAT_CONST(1.0)) { + mp_raise_ValueError(translate("sos[:, 3] should be all ones")); + } + signal_sosfilt_array(yarray, coeffs, zf_array, lenx); + zf_array += 2; + } + } + if(args[2].u_obj == mp_const_none) { + return MP_OBJ_FROM_PTR(y); + } else { + mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(2, NULL)); + tuple->items[0] = MP_OBJ_FROM_PTR(y); + tuple->items[1] = MP_OBJ_FROM_PTR(zf); + return tuple; + } +} + +MP_DEFINE_CONST_FUN_OBJ_KW(signal_sosfilt_obj, 2, signal_sosfilt); +#endif /* ULAB_SCIPY_SIGNAL_HAS_SOSFILT */ + +static const mp_rom_map_elem_t ulab_scipy_signal_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_signal) }, + #if ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM + { MP_OBJ_NEW_QSTR(MP_QSTR_spectrogram), (mp_obj_t)&signal_spectrogram_obj }, + #endif + #if ULAB_SCIPY_SIGNAL_HAS_SOSFILT + { MP_OBJ_NEW_QSTR(MP_QSTR_sosfilt), (mp_obj_t)&signal_sosfilt_obj }, + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_scipy_signal_globals, ulab_scipy_signal_globals_table); + +const mp_obj_module_t ulab_scipy_signal_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_signal_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_scipy_dot_signal, ulab_scipy_signal_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); diff --git a/circuitpython/extmod/ulab/code/scipy/signal/signal.h b/circuitpython/extmod/ulab/code/scipy/signal/signal.h new file 
mode 100644 index 0000000..21299a6 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/signal/signal.h @@ -0,0 +1,24 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös + * +*/ + +#ifndef _SCIPY_SIGNAL_ +#define _SCIPY_SIGNAL_ + +#include "../../ulab.h" +#include "../../ndarray.h" + +extern const mp_obj_module_t ulab_scipy_signal_module; + +MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(signal_spectrogram_obj); +MP_DECLARE_CONST_FUN_OBJ_KW(signal_sosfilt_obj); + +#endif /* _SCIPY_SIGNAL_ */ diff --git a/circuitpython/extmod/ulab/code/scipy/special/special.c b/circuitpython/extmod/ulab/code/scipy/special/special.c new file mode 100644 index 0000000..decfde0 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/special/special.c @@ -0,0 +1,43 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020 Jeff Epler for Adafruit Industries + * 2020 Scott Shawcroft for Adafruit Industries + * 2020-2021 Zoltán Vörös + * 2020 Taku Fukada +*/ + +#include <math.h> +#include "py/runtime.h" + +#include "../../ulab.h" +#include "../../numpy/vector.h" + +static const mp_rom_map_elem_t ulab_scipy_special_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_special) }, + #if ULAB_SCIPY_SPECIAL_HAS_ERF + { MP_OBJ_NEW_QSTR(MP_QSTR_erf), (mp_obj_t)&vector_erf_obj }, + #endif + #if ULAB_SCIPY_SPECIAL_HAS_ERFC + { MP_OBJ_NEW_QSTR(MP_QSTR_erfc), (mp_obj_t)&vector_erfc_obj }, + #endif + #if ULAB_SCIPY_SPECIAL_HAS_GAMMA + { MP_OBJ_NEW_QSTR(MP_QSTR_gamma), (mp_obj_t)&vector_gamma_obj }, + #endif + #if ULAB_SCIPY_SPECIAL_HAS_GAMMALN + { MP_OBJ_NEW_QSTR(MP_QSTR_gammaln), (mp_obj_t)&vector_lgamma_obj }, + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_scipy_special_globals, ulab_scipy_special_globals_table); + 
+const mp_obj_module_t ulab_scipy_special_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_special_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_scipy_dot_special, ulab_scipy_special_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); diff --git a/circuitpython/extmod/ulab/code/scipy/special/special.h b/circuitpython/extmod/ulab/code/scipy/special/special.h new file mode 100644 index 0000000..bb34e27 --- /dev/null +++ b/circuitpython/extmod/ulab/code/scipy/special/special.h @@ -0,0 +1,21 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös + * +*/ + +#ifndef _SCIPY_SPECIAL_ +#define _SCIPY_SPECIAL_ + +#include "../../ulab.h" +#include "../../ndarray.h" + +extern const mp_obj_module_t ulab_scipy_special_module; + +#endif /* _SCIPY_SPECIAL_ */ diff --git a/circuitpython/extmod/ulab/code/ulab.c b/circuitpython/extmod/ulab/code/ulab.c new file mode 100644 index 0000000..e8dfe0e --- /dev/null +++ b/circuitpython/extmod/ulab/code/ulab.c @@ -0,0 +1,185 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2021 Zoltán Vörös + * 2020 Jeff Epler for Adafruit Industries +*/ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "py/runtime.h" +#include "py/binary.h" +#include "py/obj.h" +#include "py/objarray.h" + +#include "ulab.h" +#include "ndarray.h" +#include "ndarray_properties.h" +#include "numpy/create.h" +#include "numpy/ndarray/ndarray_iter.h" + +#include "numpy/numpy.h" +#include "scipy/scipy.h" +// TODO: we should get rid of this; array.sort depends on it +#include "numpy/numerical.h" + +#include "user/user.h" +#include "utils/utils.h" + +#define ULAB_VERSION 4.0.0 +#define xstr(s) str(s) +#define str(s) #s + +#if ULAB_SUPPORTS_COMPLEX 
+#define ULAB_VERSION_STRING xstr(ULAB_VERSION) xstr(-) xstr(ULAB_MAX_DIMS) xstr(D-c) +#else +#define ULAB_VERSION_STRING xstr(ULAB_VERSION) xstr(-) xstr(ULAB_MAX_DIMS) xstr(D) +#endif + +STATIC MP_DEFINE_STR_OBJ(ulab_version_obj, ULAB_VERSION_STRING); + + +STATIC const mp_rom_map_elem_t ulab_ndarray_locals_dict_table[] = { + #if ULAB_MAX_DIMS > 1 + #if NDARRAY_HAS_RESHAPE + { MP_ROM_QSTR(MP_QSTR_reshape), MP_ROM_PTR(&ndarray_reshape_obj) }, + #endif + #if NDARRAY_HAS_TRANSPOSE + { MP_ROM_QSTR(MP_QSTR_transpose), MP_ROM_PTR(&ndarray_transpose_obj) }, + #endif + #endif + #if NDARRAY_HAS_BYTESWAP + { MP_ROM_QSTR(MP_QSTR_byteswap), MP_ROM_PTR(&ndarray_byteswap_obj) }, + #endif + #if NDARRAY_HAS_COPY + { MP_ROM_QSTR(MP_QSTR_copy), MP_ROM_PTR(&ndarray_copy_obj) }, + #endif + #if NDARRAY_HAS_FLATTEN + { MP_ROM_QSTR(MP_QSTR_flatten), MP_ROM_PTR(&ndarray_flatten_obj) }, + #endif + #if NDARRAY_HAS_TOBYTES + { MP_ROM_QSTR(MP_QSTR_tobytes), MP_ROM_PTR(&ndarray_tobytes_obj) }, + #endif + #if NDARRAY_HAS_TOLIST + { MP_ROM_QSTR(MP_QSTR_tolist), MP_ROM_PTR(&ndarray_tolist_obj) }, + #endif + #if NDARRAY_HAS_SORT + { MP_ROM_QSTR(MP_QSTR_sort), MP_ROM_PTR(&numerical_sort_inplace_obj) }, + #endif + #ifdef CIRCUITPY + #if NDARRAY_HAS_DTYPE + { MP_ROM_QSTR(MP_QSTR_dtype), MP_ROM_PTR(&ndarray_dtype_obj) }, + #endif + #if NDARRAY_HAS_FLATITER + { MP_ROM_QSTR(MP_QSTR_flat), MP_ROM_PTR(&ndarray_flat_obj) }, + #endif + #if NDARRAY_HAS_ITEMSIZE + { MP_ROM_QSTR(MP_QSTR_itemsize), MP_ROM_PTR(&ndarray_itemsize_obj) }, + #endif + #if NDARRAY_HAS_SHAPE + { MP_ROM_QSTR(MP_QSTR_shape), MP_ROM_PTR(&ndarray_shape_obj) }, + #endif + #if NDARRAY_HAS_SIZE + { MP_ROM_QSTR(MP_QSTR_size), MP_ROM_PTR(&ndarray_size_obj) }, + #endif + #if NDARRAY_HAS_STRIDES + { MP_ROM_QSTR(MP_QSTR_strides), MP_ROM_PTR(&ndarray_strides_obj) }, + #endif + #endif /* CIRCUITPY */ +}; + +STATIC MP_DEFINE_CONST_DICT(ulab_ndarray_locals_dict, ulab_ndarray_locals_dict_table); + +const mp_obj_type_t ulab_ndarray_type = { + { 
&mp_type_type }, + .flags = MP_TYPE_FLAG_EXTENDED + #if defined(MP_TYPE_FLAG_EQ_CHECKS_OTHER_TYPE) && defined(MP_TYPE_FLAG_EQ_HAS_NEQ_TEST) + | MP_TYPE_FLAG_EQ_CHECKS_OTHER_TYPE | MP_TYPE_FLAG_EQ_HAS_NEQ_TEST, + #endif + .name = MP_QSTR_ndarray, + .print = ndarray_print, + .make_new = ndarray_make_new, + .locals_dict = (mp_obj_dict_t*)&ulab_ndarray_locals_dict, + MP_TYPE_EXTENDED_FIELDS( + #if NDARRAY_IS_SLICEABLE + .subscr = ndarray_subscr, + #endif + #if NDARRAY_IS_ITERABLE + .getiter = ndarray_getiter, + #endif + #if NDARRAY_HAS_UNARY_OPS + .unary_op = ndarray_unary_op, + #endif + #if NDARRAY_HAS_BINARY_OPS + .binary_op = ndarray_binary_op, + #endif + #ifndef CIRCUITPY + .attr = ndarray_properties_attr, + #endif + .buffer_p = { .get_buffer = ndarray_get_buffer, }, + ) +}; + +#if ULAB_HAS_DTYPE_OBJECT +const mp_obj_type_t ulab_dtype_type = { + { &mp_type_type }, + .name = MP_QSTR_dtype, + .print = ndarray_dtype_print, + .make_new = ndarray_dtype_make_new, +}; +#endif + +#if NDARRAY_HAS_FLATITER +const mp_obj_type_t ndarray_flatiter_type = { + { &mp_type_type }, + .name = MP_QSTR_flatiter, + MP_TYPE_EXTENDED_FIELDS( + .getiter = ndarray_get_flatiterator, + ) +}; +#endif + +STATIC const mp_map_elem_t ulab_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_ulab) }, + { MP_ROM_QSTR(MP_QSTR___version__), MP_ROM_PTR(&ulab_version_obj) }, + #if ULAB_HAS_DTYPE_OBJECT + { MP_OBJ_NEW_QSTR(MP_QSTR_dtype), (mp_obj_t)&ulab_dtype_type }, + #else + #if NDARRAY_HAS_DTYPE + { MP_OBJ_NEW_QSTR(MP_QSTR_dtype), (mp_obj_t)&ndarray_dtype_obj }, + #endif /* NDARRAY_HAS_DTYPE */ + #endif /* ULAB_HAS_DTYPE_OBJECT */ + { MP_ROM_QSTR(MP_QSTR_numpy), MP_ROM_PTR((mp_obj_t)&ulab_numpy_module) }, + #if ULAB_HAS_SCIPY + { MP_ROM_QSTR(MP_QSTR_scipy), MP_ROM_PTR((mp_obj_t)&ulab_scipy_module) }, + #endif + #if ULAB_HAS_USER_MODULE + { MP_ROM_QSTR(MP_QSTR_user), MP_ROM_PTR((mp_obj_t)&ulab_user_module) }, + #endif + #if ULAB_HAS_UTILS_MODULE + { 
MP_ROM_QSTR(MP_QSTR_utils), MP_ROM_PTR((mp_obj_t)&ulab_utils_module) }, + #endif +}; + +STATIC MP_DEFINE_CONST_DICT ( + mp_module_ulab_globals, + ulab_globals_table +); + +#ifdef OPENMV +const struct _mp_obj_module_t ulab_user_cmodule = { +#else +const mp_obj_module_t ulab_user_cmodule = { +#endif + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_globals, +}; + +MP_REGISTER_MODULE(MP_QSTR_ulab, ulab_user_cmodule, MODULE_ULAB_ENABLED); diff --git a/circuitpython/extmod/ulab/code/ulab.h b/circuitpython/extmod/ulab/code/ulab.h new file mode 100644 index 0000000..924f4c7 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ulab.h @@ -0,0 +1,712 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2019-2022 Zoltán Vörös +*/ + +#ifndef __ULAB__ +#define __ULAB__ + + + +// The pre-processor constants in this file determine how ulab behaves: +// +// - how many dimensions ulab can handle +// - which functions are included in the compiled firmware +// - whether arrays can be sliced and iterated over +// - which binary/unary operators are supported +// - whether ulab can deal with complex numbers +// +// A considerable amount of flash space can be saved by removing (setting +// the corresponding constants to 0) the unnecessary functions and features. + +// Values defined here can be overridden by your own config file as +// make -DULAB_CONFIG_FILE="my_ulab_config.h" +#if defined(ULAB_CONFIG_FILE) +#include ULAB_CONFIG_FILE +#endif + +// Adds support for complex ndarrays +#ifndef ULAB_SUPPORTS_COMPLEX +#define ULAB_SUPPORTS_COMPLEX (1) +#endif + +// Determines, whether scipy is defined in ulab. 
The sub-modules and functions +// of scipy have to be defined separately +#ifndef ULAB_HAS_SCIPY +#define ULAB_HAS_SCIPY (1) +#endif + +// The maximum number of dimensions the firmware should be able to support +// Possible values lie between 1, and 4, inclusive +#ifndef ULAB_MAX_DIMS +#define ULAB_MAX_DIMS 2 +#endif + +// By setting this constant to 1, iteration over array dimensions will be implemented +// as a function (ndarray_rewind_array), instead of writing out the loops in macros +// This reduces firmware size at the expense of speed +#ifndef ULAB_HAS_FUNCTION_ITERATOR +#define ULAB_HAS_FUNCTION_ITERATOR (0) +#endif + +// If NDARRAY_IS_ITERABLE is 1, the ndarray object defines its own iterator function +// This option saves approx. 250 bytes of flash space +#ifndef NDARRAY_IS_ITERABLE +#define NDARRAY_IS_ITERABLE (1) +#endif + +// Slicing can be switched off by setting this variable to 0 +#ifndef NDARRAY_IS_SLICEABLE +#define NDARRAY_IS_SLICEABLE (1) +#endif + +// The default threshold for pretty printing. These variables can be overwritten +// at run-time via the set_printoptions() function +#ifndef ULAB_HAS_PRINTOPTIONS +#define ULAB_HAS_PRINTOPTIONS (1) +#endif +#define NDARRAY_PRINT_THRESHOLD 10 +#define NDARRAY_PRINT_EDGEITEMS 3 + +// determines, whether the dtype is an object, or simply a character +// the object implementation is numpythonic, but requires more space +#ifndef ULAB_HAS_DTYPE_OBJECT +#define ULAB_HAS_DTYPE_OBJECT (0) +#endif + +// the ndarray binary operators +#ifndef NDARRAY_HAS_BINARY_OPS +#define NDARRAY_HAS_BINARY_OPS (1) +#endif + +// Firmware size can be reduced at the expense of speed by using function +// pointers in iterations. For each operator, he function pointer saves around +// 2 kB in the two-dimensional case, and around 4 kB in the four-dimensional case. 
+ +#ifndef NDARRAY_BINARY_USES_FUN_POINTER +#define NDARRAY_BINARY_USES_FUN_POINTER (0) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_ADD +#define NDARRAY_HAS_BINARY_OP_ADD (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_EQUAL +#define NDARRAY_HAS_BINARY_OP_EQUAL (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_LESS +#define NDARRAY_HAS_BINARY_OP_LESS (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_LESS_EQUAL +#define NDARRAY_HAS_BINARY_OP_LESS_EQUAL (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_MORE +#define NDARRAY_HAS_BINARY_OP_MORE (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_MORE_EQUAL +#define NDARRAY_HAS_BINARY_OP_MORE_EQUAL (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_MULTIPLY +#define NDARRAY_HAS_BINARY_OP_MULTIPLY (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_NOT_EQUAL +#define NDARRAY_HAS_BINARY_OP_NOT_EQUAL (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_POWER +#define NDARRAY_HAS_BINARY_OP_POWER (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_SUBTRACT +#define NDARRAY_HAS_BINARY_OP_SUBTRACT (1) +#endif + +#ifndef NDARRAY_HAS_BINARY_OP_TRUE_DIVIDE +#define NDARRAY_HAS_BINARY_OP_TRUE_DIVIDE (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_OPS +#define NDARRAY_HAS_INPLACE_OPS (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_ADD +#define NDARRAY_HAS_INPLACE_ADD (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_MULTIPLY +#define NDARRAY_HAS_INPLACE_MULTIPLY (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_POWER +#define NDARRAY_HAS_INPLACE_POWER (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_SUBTRACT +#define NDARRAY_HAS_INPLACE_SUBTRACT (1) +#endif + +#ifndef NDARRAY_HAS_INPLACE_TRUE_DIVIDE +#define NDARRAY_HAS_INPLACE_TRUE_DIVIDE (1) +#endif + +// the ndarray unary operators +#ifndef NDARRAY_HAS_UNARY_OPS +#define NDARRAY_HAS_UNARY_OPS (1) +#endif + +#ifndef NDARRAY_HAS_UNARY_OP_ABS +#define NDARRAY_HAS_UNARY_OP_ABS (1) +#endif + +#ifndef NDARRAY_HAS_UNARY_OP_INVERT +#define NDARRAY_HAS_UNARY_OP_INVERT (1) +#endif + +#ifndef NDARRAY_HAS_UNARY_OP_LEN +#define NDARRAY_HAS_UNARY_OP_LEN (1) +#endif + +#ifndef 
NDARRAY_HAS_UNARY_OP_NEGATIVE +#define NDARRAY_HAS_UNARY_OP_NEGATIVE (1) +#endif + +#ifndef NDARRAY_HAS_UNARY_OP_POSITIVE +#define NDARRAY_HAS_UNARY_OP_POSITIVE (1) +#endif + + +// determines, which ndarray methods are available +#ifndef NDARRAY_HAS_BYTESWAP +#define NDARRAY_HAS_BYTESWAP (1) +#endif + +#ifndef NDARRAY_HAS_COPY +#define NDARRAY_HAS_COPY (1) +#endif + +#ifndef NDARRAY_HAS_DTYPE +#define NDARRAY_HAS_DTYPE (1) +#endif + +#ifndef NDARRAY_HAS_FLATTEN +#define NDARRAY_HAS_FLATTEN (1) +#endif + +#ifndef NDARRAY_HAS_ITEMSIZE +#define NDARRAY_HAS_ITEMSIZE (1) +#endif + +#ifndef NDARRAY_HAS_RESHAPE +#define NDARRAY_HAS_RESHAPE (1) +#endif + +#ifndef NDARRAY_HAS_SHAPE +#define NDARRAY_HAS_SHAPE (1) +#endif + +#ifndef NDARRAY_HAS_SIZE +#define NDARRAY_HAS_SIZE (1) +#endif + +#ifndef NDARRAY_HAS_SORT +#define NDARRAY_HAS_SORT (1) +#endif + +#ifndef NDARRAY_HAS_STRIDES +#define NDARRAY_HAS_STRIDES (1) +#endif + +#ifndef NDARRAY_HAS_TOBYTES +#define NDARRAY_HAS_TOBYTES (1) +#endif + +#ifndef NDARRAY_HAS_TOLIST +#define NDARRAY_HAS_TOLIST (1) +#endif + +#ifndef NDARRAY_HAS_TRANSPOSE +#define NDARRAY_HAS_TRANSPOSE (1) +#endif + +// Firmware size can be reduced at the expense of speed by using a function +// pointer in iterations. Setting ULAB_VECTORISE_USES_FUNCPOINTER to 1 saves +// around 800 bytes in the four-dimensional case, and around 200 in two dimensions. 
+#ifndef ULAB_VECTORISE_USES_FUN_POINTER +#define ULAB_VECTORISE_USES_FUN_POINTER (1) +#endif + +// determines, whether e is defined in ulab.numpy itself +#ifndef ULAB_NUMPY_HAS_E +#define ULAB_NUMPY_HAS_E (1) +#endif + +// ulab defines infinite as a class constant in ulab.numpy +#ifndef ULAB_NUMPY_HAS_INF +#define ULAB_NUMPY_HAS_INF (1) +#endif + +// ulab defines NaN as a class constant in ulab.numpy +#ifndef ULAB_NUMPY_HAS_NAN +#define ULAB_NUMPY_HAS_NAN (1) +#endif + +// determines, whether pi is defined in ulab.numpy itself +#ifndef ULAB_NUMPY_HAS_PI +#define ULAB_NUMPY_HAS_PI (1) +#endif + +// determines, whether the ndinfo function is available +#ifndef ULAB_NUMPY_HAS_NDINFO +#define ULAB_NUMPY_HAS_NDINFO (1) +#endif + +// if this constant is set to 1, the interpreter can iterate +// over the flat array without copying any data +#ifndef NDARRAY_HAS_FLATITER +#define NDARRAY_HAS_FLATITER (1) +#endif + +// frombuffer adds 600 bytes to the firmware +#ifndef ULAB_NUMPY_HAS_FROMBUFFER +#define ULAB_NUMPY_HAS_FROMBUFFER (1) +#endif + +// functions that create an array +#ifndef ULAB_NUMPY_HAS_ARANGE +#define ULAB_NUMPY_HAS_ARANGE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_CONCATENATE +#define ULAB_NUMPY_HAS_CONCATENATE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_DIAG +#define ULAB_NUMPY_HAS_DIAG (1) +#endif + +#ifndef ULAB_NUMPY_HAS_EMPTY +#define ULAB_NUMPY_HAS_EMPTY (1) +#endif + +#ifndef ULAB_NUMPY_HAS_EYE +#define ULAB_NUMPY_HAS_EYE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_FULL +#define ULAB_NUMPY_HAS_FULL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_LINSPACE +#define ULAB_NUMPY_HAS_LINSPACE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_LOGSPACE +#define ULAB_NUMPY_HAS_LOGSPACE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ONES +#define ULAB_NUMPY_HAS_ONES (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ZEROS +#define ULAB_NUMPY_HAS_ZEROS (1) +#endif + +// functions that compare arrays +#ifndef ULAB_NUMPY_HAS_CLIP +#define ULAB_NUMPY_HAS_CLIP (1) +#endif + +#ifndef ULAB_NUMPY_HAS_EQUAL +#define 
ULAB_NUMPY_HAS_EQUAL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ISFINITE +#define ULAB_NUMPY_HAS_ISFINITE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ISINF +#define ULAB_NUMPY_HAS_ISINF (1) +#endif + +#ifndef ULAB_NUMPY_HAS_MAXIMUM +#define ULAB_NUMPY_HAS_MAXIMUM (1) +#endif + +#ifndef ULAB_NUMPY_HAS_MINIMUM +#define ULAB_NUMPY_HAS_MINIMUM (1) +#endif + +#ifndef ULAB_NUMPY_HAS_NOTEQUAL +#define ULAB_NUMPY_HAS_NOTEQUAL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_WHERE +#define ULAB_NUMPY_HAS_WHERE (1) +#endif + +// the linalg module; functions of the linalg module still have +// to be defined separately +#ifndef ULAB_NUMPY_HAS_LINALG_MODULE +#define ULAB_NUMPY_HAS_LINALG_MODULE (1) +#endif + +#ifndef ULAB_LINALG_HAS_CHOLESKY +#define ULAB_LINALG_HAS_CHOLESKY (1) +#endif + +#ifndef ULAB_LINALG_HAS_DET +#define ULAB_LINALG_HAS_DET (1) +#endif + +#ifndef ULAB_LINALG_HAS_EIG +#define ULAB_LINALG_HAS_EIG (1) +#endif + +#ifndef ULAB_LINALG_HAS_INV +#define ULAB_LINALG_HAS_INV (1) +#endif + +#ifndef ULAB_LINALG_HAS_NORM +#define ULAB_LINALG_HAS_NORM (1) +#endif + +#ifndef ULAB_LINALG_HAS_QR +#define ULAB_LINALG_HAS_QR (1) +#endif + +// the FFT module; functions of the fft module still have +// to be defined separately +#ifndef ULAB_NUMPY_HAS_FFT_MODULE +#define ULAB_NUMPY_HAS_FFT_MODULE (1) +#endif + +// By setting this constant to 1, the FFT routine will behave in a +// numpy-compatible way, i.e., it will output a complex array +// This setting has no effect, if ULAB_SUPPORTS_COMPLEX is 0 +// Note that in this case, the input also must be numpythonic, +// i.e., the real an imaginary parts cannot be passed as two arguments +#ifndef ULAB_FFT_IS_NUMPY_COMPATIBLE +#define ULAB_FFT_IS_NUMPY_COMPATIBLE (0) +#endif + +#ifndef ULAB_FFT_HAS_FFT +#define ULAB_FFT_HAS_FFT (1) +#endif + +#ifndef ULAB_FFT_HAS_IFFT +#define ULAB_FFT_HAS_IFFT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ALL +#define ULAB_NUMPY_HAS_ALL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ANY +#define ULAB_NUMPY_HAS_ANY (1) +#endif + +#ifndef 
ULAB_NUMPY_HAS_ARGMINMAX +#define ULAB_NUMPY_HAS_ARGMINMAX (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ARGSORT +#define ULAB_NUMPY_HAS_ARGSORT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_COMPRESS +#define ULAB_NUMPY_HAS_COMPRESS (1) +#endif + +#ifndef ULAB_NUMPY_HAS_CONVOLVE +#define ULAB_NUMPY_HAS_CONVOLVE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_CROSS +#define ULAB_NUMPY_HAS_CROSS (1) +#endif + +#ifndef ULAB_NUMPY_HAS_DIFF +#define ULAB_NUMPY_HAS_DIFF (1) +#endif + +#ifndef ULAB_NUMPY_HAS_DOT +#define ULAB_NUMPY_HAS_DOT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_FLIP +#define ULAB_NUMPY_HAS_FLIP (1) +#endif + +#ifndef ULAB_NUMPY_HAS_INTERP +#define ULAB_NUMPY_HAS_INTERP (1) +#endif + +#ifndef ULAB_NUMPY_HAS_MEAN +#define ULAB_NUMPY_HAS_MEAN (1) +#endif + +#ifndef ULAB_NUMPY_HAS_MEDIAN +#define ULAB_NUMPY_HAS_MEDIAN (1) +#endif + +#ifndef ULAB_NUMPY_HAS_MINMAX +#define ULAB_NUMPY_HAS_MINMAX (1) +#endif + +#ifndef ULAB_NUMPY_HAS_POLYFIT +#define ULAB_NUMPY_HAS_POLYFIT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_POLYVAL +#define ULAB_NUMPY_HAS_POLYVAL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ROLL +#define ULAB_NUMPY_HAS_ROLL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SORT +#define ULAB_NUMPY_HAS_SORT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_STD +#define ULAB_NUMPY_HAS_STD (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SUM +#define ULAB_NUMPY_HAS_SUM (1) +#endif + +#ifndef ULAB_NUMPY_HAS_TRACE +#define ULAB_NUMPY_HAS_TRACE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_TRAPZ +#define ULAB_NUMPY_HAS_TRAPZ (1) +#endif + +// vectorised versions of the functions of the math python module, with +// the exception of the functions listed in scipy.special +#ifndef ULAB_NUMPY_HAS_ACOS +#define ULAB_NUMPY_HAS_ACOS (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ACOSH +#define ULAB_NUMPY_HAS_ACOSH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ARCTAN2 +#define ULAB_NUMPY_HAS_ARCTAN2 (1) +#endif + +#ifndef ULAB_NUMPY_HAS_AROUND +#define ULAB_NUMPY_HAS_AROUND (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ASIN +#define ULAB_NUMPY_HAS_ASIN (1) +#endif + +#ifndef 
ULAB_NUMPY_HAS_ASINH +#define ULAB_NUMPY_HAS_ASINH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ATAN +#define ULAB_NUMPY_HAS_ATAN (1) +#endif + +#ifndef ULAB_NUMPY_HAS_ATANH +#define ULAB_NUMPY_HAS_ATANH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_CEIL +#define ULAB_NUMPY_HAS_CEIL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_COS +#define ULAB_NUMPY_HAS_COS (1) +#endif + +#ifndef ULAB_NUMPY_HAS_COSH +#define ULAB_NUMPY_HAS_COSH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_DEGREES +#define ULAB_NUMPY_HAS_DEGREES (1) +#endif + +#ifndef ULAB_NUMPY_HAS_EXP +#define ULAB_NUMPY_HAS_EXP (1) +#endif + +#ifndef ULAB_NUMPY_HAS_EXPM1 +#define ULAB_NUMPY_HAS_EXPM1 (1) +#endif + +#ifndef ULAB_NUMPY_HAS_FLOOR +#define ULAB_NUMPY_HAS_FLOOR (1) +#endif + +#ifndef ULAB_NUMPY_HAS_LOG +#define ULAB_NUMPY_HAS_LOG (1) +#endif + +#ifndef ULAB_NUMPY_HAS_LOG10 +#define ULAB_NUMPY_HAS_LOG10 (1) +#endif + +#ifndef ULAB_NUMPY_HAS_LOG2 +#define ULAB_NUMPY_HAS_LOG2 (1) +#endif + +#ifndef ULAB_NUMPY_HAS_RADIANS +#define ULAB_NUMPY_HAS_RADIANS (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SIN +#define ULAB_NUMPY_HAS_SIN (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SINH +#define ULAB_NUMPY_HAS_SINH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SQRT +#define ULAB_NUMPY_HAS_SQRT (1) +#endif + +#ifndef ULAB_NUMPY_HAS_TAN +#define ULAB_NUMPY_HAS_TAN (1) +#endif + +#ifndef ULAB_NUMPY_HAS_TANH +#define ULAB_NUMPY_HAS_TANH (1) +#endif + +#ifndef ULAB_NUMPY_HAS_VECTORIZE +#define ULAB_NUMPY_HAS_VECTORIZE (1) +#endif + +// Complex functions. 
The implementations are compiled into +// the firmware, only if ULAB_SUPPORTS_COMPLEX is set to 1 +#ifndef ULAB_NUMPY_HAS_CONJUGATE +#define ULAB_NUMPY_HAS_CONJUGATE (1) +#endif + +#ifndef ULAB_NUMPY_HAS_IMAG +#define ULAB_NUMPY_HAS_IMAG (1) +#endif + +#ifndef ULAB_NUMPY_HAS_REAL +#define ULAB_NUMPY_HAS_REAL (1) +#endif + +#ifndef ULAB_NUMPY_HAS_SORT_COMPLEX +#define ULAB_NUMPY_HAS_SORT_COMPLEX (1) +#endif + +// scipy modules +#ifndef ULAB_SCIPY_HAS_LINALG_MODULE +#define ULAB_SCIPY_HAS_LINALG_MODULE (1) +#endif + +#ifndef ULAB_SCIPY_LINALG_HAS_CHO_SOLVE +#define ULAB_SCIPY_LINALG_HAS_CHO_SOLVE (1) +#endif + +#ifndef ULAB_SCIPY_LINALG_HAS_SOLVE_TRIANGULAR +#define ULAB_SCIPY_LINALG_HAS_SOLVE_TRIANGULAR (1) +#endif + +#ifndef ULAB_SCIPY_HAS_SIGNAL_MODULE +#define ULAB_SCIPY_HAS_SIGNAL_MODULE (1) +#endif + +#ifndef ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM +#define ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM (1) +#endif + +#ifndef ULAB_SCIPY_SIGNAL_HAS_SOSFILT +#define ULAB_SCIPY_SIGNAL_HAS_SOSFILT (1) +#endif + +#ifndef ULAB_SCIPY_HAS_OPTIMIZE_MODULE +#define ULAB_SCIPY_HAS_OPTIMIZE_MODULE (1) +#endif + +#ifndef ULAB_SCIPY_OPTIMIZE_HAS_BISECT +#define ULAB_SCIPY_OPTIMIZE_HAS_BISECT (1) +#endif + +#ifndef ULAB_SCIPY_OPTIMIZE_HAS_CURVE_FIT +#define ULAB_SCIPY_OPTIMIZE_HAS_CURVE_FIT (0) // not fully implemented +#endif + +#ifndef ULAB_SCIPY_OPTIMIZE_HAS_FMIN +#define ULAB_SCIPY_OPTIMIZE_HAS_FMIN (1) +#endif + +#ifndef ULAB_SCIPY_OPTIMIZE_HAS_NEWTON +#define ULAB_SCIPY_OPTIMIZE_HAS_NEWTON (1) +#endif + +#ifndef ULAB_SCIPY_HAS_SPECIAL_MODULE +#define ULAB_SCIPY_HAS_SPECIAL_MODULE (1) +#endif + +#ifndef ULAB_SCIPY_SPECIAL_HAS_ERF +#define ULAB_SCIPY_SPECIAL_HAS_ERF (1) +#endif + +#ifndef ULAB_SCIPY_SPECIAL_HAS_ERFC +#define ULAB_SCIPY_SPECIAL_HAS_ERFC (1) +#endif + +#ifndef ULAB_SCIPY_SPECIAL_HAS_GAMMA +#define ULAB_SCIPY_SPECIAL_HAS_GAMMA (1) +#endif + +#ifndef ULAB_SCIPY_SPECIAL_HAS_GAMMALN +#define ULAB_SCIPY_SPECIAL_HAS_GAMMALN (1) +#endif + +// user-defined module; source of the 
module and +// its sub-modules should be placed in code/user/ +#ifndef ULAB_HAS_USER_MODULE +#define ULAB_HAS_USER_MODULE (0) +#endif + +#ifndef ULAB_HAS_UTILS_MODULE +#define ULAB_HAS_UTILS_MODULE (1) +#endif + +#ifndef ULAB_UTILS_HAS_FROM_INT16_BUFFER +#define ULAB_UTILS_HAS_FROM_INT16_BUFFER (1) +#endif + +#ifndef ULAB_UTILS_HAS_FROM_UINT16_BUFFER +#define ULAB_UTILS_HAS_FROM_UINT16_BUFFER (1) +#endif + +#ifndef ULAB_UTILS_HAS_FROM_INT32_BUFFER +#define ULAB_UTILS_HAS_FROM_INT32_BUFFER (1) +#endif + +#ifndef ULAB_UTILS_HAS_FROM_UINT32_BUFFER +#define ULAB_UTILS_HAS_FROM_UINT32_BUFFER (1) +#endif + +#endif diff --git a/circuitpython/extmod/ulab/code/ulab_tools.c b/circuitpython/extmod/ulab/code/ulab_tools.c new file mode 100644 index 0000000..7fb6363 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ulab_tools.c @@ -0,0 +1,260 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2022 Zoltán Vörös + */ + + +#include <string.h> +#include "py/runtime.h" + +#include "ulab.h" +#include "ndarray.h" +#include "ulab_tools.h" + +// The following five functions return a float from a void type +// The value in question is supposed to be located at the head of the pointer + +mp_float_t ndarray_get_float_uint8(void *data) { + // Returns a float value from an uint8_t type + return (mp_float_t)(*(uint8_t *)data); +} + +mp_float_t ndarray_get_float_int8(void *data) { + // Returns a float value from an int8_t type + return (mp_float_t)(*(int8_t *)data); +} + +mp_float_t ndarray_get_float_uint16(void *data) { + // Returns a float value from an uint16_t type + return (mp_float_t)(*(uint16_t *)data); +} + +mp_float_t ndarray_get_float_int16(void *data) { + // Returns a float value from an int16_t type + return (mp_float_t)(*(int16_t *)data); +} + + +mp_float_t ndarray_get_float_float(void *data) { + // Returns a float value from an mp_float_t type + return 
*((mp_float_t *)data); +} + +// returns a single function pointer, depending on the dtype +void *ndarray_get_float_function(uint8_t dtype) { + if(dtype == NDARRAY_UINT8) { + return ndarray_get_float_uint8; + } else if(dtype == NDARRAY_INT8) { + return ndarray_get_float_int8; + } else if(dtype == NDARRAY_UINT16) { + return ndarray_get_float_uint16; + } else if(dtype == NDARRAY_INT16) { + return ndarray_get_float_int16; + } else { + return ndarray_get_float_float; + } +} + +mp_float_t ndarray_get_float_index(void *data, uint8_t dtype, size_t index) { + // returns a single float value from an array located at index + if(dtype == NDARRAY_UINT8) { + return (mp_float_t)((uint8_t *)data)[index]; + } else if(dtype == NDARRAY_INT8) { + return (mp_float_t)((int8_t *)data)[index]; + } else if(dtype == NDARRAY_UINT16) { + return (mp_float_t)((uint16_t *)data)[index]; + } else if(dtype == NDARRAY_INT16) { + return (mp_float_t)((int16_t *)data)[index]; + } else { + return (mp_float_t)((mp_float_t *)data)[index]; + } +} + +mp_float_t ndarray_get_float_value(void *data, uint8_t dtype) { + // Returns a float value from an arbitrary data type + // The value in question is supposed to be located at the head of the pointer + if(dtype == NDARRAY_UINT8) { + return (mp_float_t)(*(uint8_t *)data); + } else if(dtype == NDARRAY_INT8) { + return (mp_float_t)(*(int8_t *)data); + } else if(dtype == NDARRAY_UINT16) { + return (mp_float_t)(*(uint16_t *)data); + } else if(dtype == NDARRAY_INT16) { + return (mp_float_t)(*(int16_t *)data); + } else { + return *((mp_float_t *)data); + } +} + +#if NDARRAY_BINARY_USES_FUN_POINTER | ULAB_NUMPY_HAS_WHERE +uint8_t ndarray_upcast_dtype(uint8_t ldtype, uint8_t rdtype) { + // returns a single character that corresponds to the broadcasting rules + // - if one of the operarands is a float, the result is always float + // - operation on identical types preserves type + // + // uint8 + int8 => int16 + // uint8 + int16 => int16 + // uint8 + uint16 => uint16 + // 
int8 + int16 => int16 + // int8 + uint16 => uint16 + // uint16 + int16 => float + + if(ldtype == rdtype) { + // if the two dtypes are equal, the result is also of that type + return ldtype; + } else if(((ldtype == NDARRAY_UINT8) && (rdtype == NDARRAY_INT8)) || + ((ldtype == NDARRAY_INT8) && (rdtype == NDARRAY_UINT8)) || + ((ldtype == NDARRAY_UINT8) && (rdtype == NDARRAY_INT16)) || + ((ldtype == NDARRAY_INT16) && (rdtype == NDARRAY_UINT8)) || + ((ldtype == NDARRAY_INT8) && (rdtype == NDARRAY_INT16)) || + ((ldtype == NDARRAY_INT16) && (rdtype == NDARRAY_INT8))) { + return NDARRAY_INT16; + } else if(((ldtype == NDARRAY_UINT8) && (rdtype == NDARRAY_UINT16)) || + ((ldtype == NDARRAY_UINT16) && (rdtype == NDARRAY_UINT8)) || + ((ldtype == NDARRAY_INT8) && (rdtype == NDARRAY_UINT16)) || + ((ldtype == NDARRAY_UINT16) && (rdtype == NDARRAY_INT8))) { + return NDARRAY_UINT16; + } + return NDARRAY_FLOAT; +} + +// The following five functions are the inverse of the ndarray_get_... functions, +// and write a floating point datum into a void pointer + +void ndarray_set_float_uint8(void *data, mp_float_t datum) { + *((uint8_t *)data) = (uint8_t)datum; +} + +void ndarray_set_float_int8(void *data, mp_float_t datum) { + *((int8_t *)data) = (int8_t)datum; +} + +void ndarray_set_float_uint16(void *data, mp_float_t datum) { + *((uint16_t *)data) = (uint16_t)datum; +} + +void ndarray_set_float_int16(void *data, mp_float_t datum) { + *((int16_t *)data) = (int16_t)datum; +} + +void ndarray_set_float_float(void *data, mp_float_t datum) { + *((mp_float_t *)data) = datum; +} + +// returns a single function pointer, depending on the dtype +void *ndarray_set_float_function(uint8_t dtype) { + if(dtype == NDARRAY_UINT8) { + return ndarray_set_float_uint8; + } else if(dtype == NDARRAY_INT8) { + return ndarray_set_float_int8; + } else if(dtype == NDARRAY_UINT16) { + return ndarray_set_float_uint16; + } else if(dtype == NDARRAY_INT16) { + return ndarray_set_float_int16; + } else { + return 
ndarray_set_float_float; + } +} +#endif /* NDARRAY_BINARY_USES_FUN_POINTER */ + +shape_strides tools_reduce_axes(ndarray_obj_t *ndarray, mp_obj_t axis) { + // TODO: replace numerical_reduce_axes with this function, wherever applicable + // This function should be used, whenever a tensor is contracted; + // The shape and strides at `axis` are moved to the zeroth position, + // everything else is aligned to the right + if(!mp_obj_is_int(axis) & (axis != mp_const_none)) { + mp_raise_TypeError(translate("axis must be None, or an integer")); + } + shape_strides _shape_strides; + + size_t *shape = m_new(size_t, ULAB_MAX_DIMS + 1); + _shape_strides.shape = shape; + int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS + 1); + _shape_strides.strides = strides; + + _shape_strides.increment = 0; + // this is the contracted dimension (won't be overwritten for axis == None) + _shape_strides.ndim = 0; + + memcpy(_shape_strides.shape, ndarray->shape, sizeof(size_t) * ULAB_MAX_DIMS); + memcpy(_shape_strides.strides, ndarray->strides, sizeof(int32_t) * ULAB_MAX_DIMS); + + if(axis == mp_const_none) { + return _shape_strides; + } + + uint8_t index = ULAB_MAX_DIMS - 1; // value of index for axis == mp_const_none (won't be overwritten) + + if(axis != mp_const_none) { // i.e., axis is an integer + int8_t ax = mp_obj_get_int(axis); + if(ax < 0) ax += ndarray->ndim; + if((ax < 0) || (ax > ndarray->ndim - 1)) { + mp_raise_ValueError(translate("index out of range")); + } + index = ULAB_MAX_DIMS - ndarray->ndim + ax; + _shape_strides.ndim = ndarray->ndim - 1; + } + + // move the value stored at index to the leftmost position, and align everything else to the right + _shape_strides.shape[0] = ndarray->shape[index]; + _shape_strides.strides[0] = ndarray->strides[index]; + for(uint8_t i = 0; i < index; i++) { + // entries to the right of index must be shifted by one position to the left + _shape_strides.shape[i + 1] = ndarray->shape[i]; + _shape_strides.strides[i + 1] = ndarray->strides[i]; + } + + 
if(_shape_strides.ndim != 0) { + _shape_strides.increment = 1; + } + + return _shape_strides; +} + +int8_t tools_get_axis(mp_obj_t axis, uint8_t ndim) { + int8_t ax = mp_obj_get_int(axis); + if(ax < 0) ax += ndim; + if((ax < 0) || (ax > ndim - 1)) { + mp_raise_ValueError(translate("axis is out of bounds")); + } + return ax; +} + +#if ULAB_MAX_DIMS > 1 +ndarray_obj_t *tools_object_is_square(mp_obj_t obj) { + // Returns an ndarray, if the object is a square ndarray, + // raises the appropriate exception otherwise + if(!mp_obj_is_type(obj, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("size is defined for ndarrays only")); + } + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(obj); + if((ndarray->shape[ULAB_MAX_DIMS - 1] != ndarray->shape[ULAB_MAX_DIMS - 2]) || (ndarray->ndim != 2)) { + mp_raise_ValueError(translate("input must be square matrix")); + } + return ndarray; +} +#endif + +uint8_t ulab_binary_get_size(uint8_t dtype) { + #if ULAB_SUPPORTS_COMPLEX + if(dtype == NDARRAY_COMPLEX) { + return 2 * (uint8_t)sizeof(mp_float_t); + } + #endif + return dtype == NDARRAY_BOOL ? 1 : mp_binary_get_size('@', dtype, NULL); +} + +#if ULAB_SUPPORTS_COMPLEX +void ulab_rescale_float_strides(int32_t *strides) { + // re-scale the strides, so that we can work with floats, when iterating + uint8_t sz = sizeof(mp_float_t); + for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) { + strides[i] /= sz; + } +} +#endif
\ No newline at end of file diff --git a/circuitpython/extmod/ulab/code/ulab_tools.h b/circuitpython/extmod/ulab/code/ulab_tools.h new file mode 100644 index 0000000..2898ef1 --- /dev/null +++ b/circuitpython/extmod/ulab/code/ulab_tools.h @@ -0,0 +1,45 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2022 Zoltán Vörös +*/ + +#ifndef _TOOLS_ +#define _TOOLS_ + +#include "ndarray.h" + +#define SWAP(t, a, b) { t tmp = a; a = b; b = tmp; } + +typedef struct _shape_strides_t { + uint8_t increment; + uint8_t ndim; + size_t *shape; + int32_t *strides; +} shape_strides; + +mp_float_t ndarray_get_float_uint8(void *); +mp_float_t ndarray_get_float_int8(void *); +mp_float_t ndarray_get_float_uint16(void *); +mp_float_t ndarray_get_float_int16(void *); +mp_float_t ndarray_get_float_float(void *); +void *ndarray_get_float_function(uint8_t ); + +uint8_t ndarray_upcast_dtype(uint8_t , uint8_t ); +void *ndarray_set_float_function(uint8_t ); + +shape_strides tools_reduce_axes(ndarray_obj_t *, mp_obj_t ); +int8_t tools_get_axis(mp_obj_t , uint8_t ); +ndarray_obj_t *tools_object_is_square(mp_obj_t ); + +uint8_t ulab_binary_get_size(uint8_t ); + +#if ULAB_SUPPORTS_COMPLEX +void ulab_rescale_float_strides(int32_t *); +#endif + +#endif diff --git a/circuitpython/extmod/ulab/code/user/user.c b/circuitpython/extmod/ulab/code/user/user.c new file mode 100644 index 0000000..5ee890a --- /dev/null +++ b/circuitpython/extmod/ulab/code/user/user.c @@ -0,0 +1,96 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" +#include "user.h" + +#if ULAB_HAS_USER_MODULE + +//| """This module should hold arbitrary 
user-defined functions.""" +//| + +static mp_obj_t user_square(mp_obj_t arg) { + // the function takes a single dense ndarray, and calculates the + // element-wise square of its entries + + // raise a TypeError exception, if the input is not an ndarray + if(!mp_obj_is_type(arg, &ulab_ndarray_type)) { + mp_raise_TypeError(translate("input must be an ndarray")); + } + ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(arg); + + // make sure that the input is a dense array + if(!ndarray_is_dense(ndarray)) { + mp_raise_TypeError(translate("input must be a dense ndarray")); + } + + // if the input is a dense array, create `results` with the same number of + // dimensions, shape, and dtype + ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype); + + // since in a dense array the iteration over the elements is trivial, we + // can cast the data arrays ndarray->array and results->array to the actual type + if(ndarray->dtype == NDARRAY_UINT8) { + uint8_t *array = (uint8_t *)ndarray->array; + uint8_t *rarray = (uint8_t *)results->array; + for(size_t i=0; i < ndarray->len; i++, array++) { + *rarray++ = (*array) * (*array); + } + } else if(ndarray->dtype == NDARRAY_INT8) { + int8_t *array = (int8_t *)ndarray->array; + int8_t *rarray = (int8_t *)results->array; + for(size_t i=0; i < ndarray->len; i++, array++) { + *rarray++ = (*array) * (*array); + } + } else if(ndarray->dtype == NDARRAY_UINT16) { + uint16_t *array = (uint16_t *)ndarray->array; + uint16_t *rarray = (uint16_t *)results->array; + for(size_t i=0; i < ndarray->len; i++, array++) { + *rarray++ = (*array) * (*array); + } + } else if(ndarray->dtype == NDARRAY_INT16) { + int16_t *array = (int16_t *)ndarray->array; + int16_t *rarray = (int16_t *)results->array; + for(size_t i=0; i < ndarray->len; i++, array++) { + *rarray++ = (*array) * (*array); + } + } else { // if we end up here, the dtype is NDARRAY_FLOAT + mp_float_t *array = (mp_float_t *)ndarray->array; + mp_float_t *rarray = 
(mp_float_t *)results->array; + for(size_t i=0; i < ndarray->len; i++, array++) { + *rarray++ = (*array) * (*array); + } + } + // at the end, return a micrppython object + return MP_OBJ_FROM_PTR(results); +} + +MP_DEFINE_CONST_FUN_OBJ_1(user_square_obj, user_square); + +static const mp_rom_map_elem_t ulab_user_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_user) }, + { MP_OBJ_NEW_QSTR(MP_QSTR_square), (mp_obj_t)&user_square_obj }, +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_user_globals, ulab_user_globals_table); + +const mp_obj_module_t ulab_user_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_user_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_user, ulab_user_module, ULAB_HAS_USER_MODULE && CIRCUITPY_ULAB); + +#endif diff --git a/circuitpython/extmod/ulab/code/user/user.h b/circuitpython/extmod/ulab/code/user/user.h new file mode 100644 index 0000000..ff274f4 --- /dev/null +++ b/circuitpython/extmod/ulab/code/user/user.h @@ -0,0 +1,20 @@ + +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#ifndef _USER_ +#define _USER_ + +#include "../ulab.h" +#include "../ndarray.h" + +extern const mp_obj_module_t ulab_user_module; + +#endif diff --git a/circuitpython/extmod/ulab/code/utils/utils.c b/circuitpython/extmod/ulab/code/utils/utils.c new file mode 100644 index 0000000..c265d49 --- /dev/null +++ b/circuitpython/extmod/ulab/code/utils/utils.c @@ -0,0 +1,216 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include "py/obj.h" +#include "py/runtime.h" +#include "py/misc.h" +#include "utils.h" + +#if ULAB_HAS_UTILS_MODULE + +enum UTILS_BUFFER_TYPE { 
+ UTILS_INT16_BUFFER, + UTILS_UINT16_BUFFER, + UTILS_INT32_BUFFER, + UTILS_UINT32_BUFFER, +}; + +#if ULAB_UTILS_HAS_FROM_INT16_BUFFER | ULAB_UTILS_HAS_FROM_UINT16_BUFFER | ULAB_UTILS_HAS_FROM_INT32_BUFFER | ULAB_UTILS_HAS_FROM_UINT32_BUFFER +static mp_obj_t utils_from_intbuffer_helper(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args, uint8_t buffer_type) { + static const mp_arg_t allowed_args[] = { + { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } } , + { MP_QSTR_count, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(-1) } }, + { MP_QSTR_offset, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(0) } }, + { MP_QSTR_out, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, + { MP_QSTR_byteswap, MP_ARG_OBJ, { .u_rom_obj = mp_const_false } }, + }; + + mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)]; + mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args); + + ndarray_obj_t *ndarray = NULL; + + if(args[3].u_obj != mp_const_none) { + ndarray = MP_OBJ_TO_PTR(args[3].u_obj); + if((ndarray->dtype != NDARRAY_FLOAT) || !ndarray_is_dense(ndarray)) { + mp_raise_TypeError(translate("out must be a float dense array")); + } + } + + size_t offset = mp_obj_get_int(args[2].u_obj); + + mp_buffer_info_t bufinfo; + if(mp_get_buffer(args[0].u_obj, &bufinfo, MP_BUFFER_READ)) { + if(bufinfo.len < offset) { + mp_raise_ValueError(translate("offset is too large")); + } + uint8_t sz = sizeof(int16_t); + #if ULAB_UTILS_HAS_FROM_INT32_BUFFER | ULAB_UTILS_HAS_FROM_UINT32_BUFFER + if((buffer_type == UTILS_INT32_BUFFER) || (buffer_type == UTILS_UINT32_BUFFER)) { + sz = sizeof(int32_t); + } + #endif + + size_t len = (bufinfo.len - offset) / sz; + if((len * sz) != (bufinfo.len - offset)) { + mp_raise_ValueError(translate("buffer size must be a multiple of element size")); + } + if(mp_obj_get_int(args[1].u_obj) > 0) { + size_t count = mp_obj_get_int(args[1].u_obj); + if(len < count) { + mp_raise_ValueError(translate("buffer 
is smaller than requested size")); + } else { + len = count; + } + } + if(args[3].u_obj == mp_const_none) { + ndarray = ndarray_new_linear_array(len, NDARRAY_FLOAT); + } else { + if(ndarray->len < len) { + mp_raise_ValueError(translate("out array is too small")); + } + } + uint8_t *buffer = bufinfo.buf; + + mp_float_t *array = (mp_float_t *)ndarray->array; + if(args[4].u_obj == mp_const_true) { + // swap the bytes before conversion + uint8_t *tmpbuff = m_new(uint8_t, sz); + #if ULAB_UTILS_HAS_FROM_INT16_BUFFER | ULAB_UTILS_HAS_FROM_UINT16_BUFFER + if((buffer_type == UTILS_INT16_BUFFER) || (buffer_type == UTILS_UINT16_BUFFER)) { + for(size_t i = 0; i < len; i++) { + tmpbuff += sz; + for(uint8_t j = 0; j < sz; j++) { + memcpy(--tmpbuff, buffer++, 1); + } + if(buffer_type == UTILS_INT16_BUFFER) { + *array++ = (mp_float_t)(*(int16_t *)tmpbuff); + } else { + *array++ = (mp_float_t)(*(uint16_t *)tmpbuff); + } + } + } + #endif + #if ULAB_UTILS_HAS_FROM_INT32_BUFFER | ULAB_UTILS_HAS_FROM_UINT32_BUFFER + if((buffer_type == UTILS_INT32_BUFFER) || (buffer_type == UTILS_UINT32_BUFFER)) { + for(size_t i = 0; i < len; i++) { + tmpbuff += sz; + for(uint8_t j = 0; j < sz; j++) { + memcpy(--tmpbuff, buffer++, 1); + } + if(buffer_type == UTILS_INT32_BUFFER) { + *array++ = (mp_float_t)(*(int32_t *)tmpbuff); + } else { + *array++ = (mp_float_t)(*(uint32_t *)tmpbuff); + } + } + } + #endif + } else { + #if ULAB_UTILS_HAS_FROM_INT16_BUFFER + if(buffer_type == UTILS_INT16_BUFFER) { + for(size_t i = 0; i < len; i++) { + *array++ = (mp_float_t)(*(int16_t *)buffer); + buffer += sz; + } + } + #endif + #if ULAB_UTILS_HAS_FROM_UINT16_BUFFER + if(buffer_type == UTILS_UINT16_BUFFER) { + for(size_t i = 0; i < len; i++) { + *array++ = (mp_float_t)(*(uint16_t *)buffer); + buffer += sz; + } + } + #endif + #if ULAB_UTILS_HAS_FROM_INT32_BUFFER + if(buffer_type == UTILS_INT32_BUFFER) { + for(size_t i = 0; i < len; i++) { + *array++ = (mp_float_t)(*(int32_t *)buffer); + buffer += sz; + } + } + #endif + 
#if ULAB_UTILS_HAS_FROM_UINT32_BUFFER + if(buffer_type == UTILS_UINT32_BUFFER) { + for(size_t i = 0; i < len; i++) { + *array++ = (mp_float_t)(*(uint32_t *)buffer); + buffer += sz; + } + } + #endif + } + return MP_OBJ_FROM_PTR(ndarray); + } + return mp_const_none; +} + +#ifdef ULAB_UTILS_HAS_FROM_INT16_BUFFER +static mp_obj_t utils_from_int16_buffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return utils_from_intbuffer_helper(n_args, pos_args, kw_args, UTILS_INT16_BUFFER); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(utils_from_int16_buffer_obj, 1, utils_from_int16_buffer); +#endif + +#ifdef ULAB_UTILS_HAS_FROM_UINT16_BUFFER +static mp_obj_t utils_from_uint16_buffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return utils_from_intbuffer_helper(n_args, pos_args, kw_args, UTILS_UINT16_BUFFER); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(utils_from_uint16_buffer_obj, 1, utils_from_uint16_buffer); +#endif + +#ifdef ULAB_UTILS_HAS_FROM_INT32_BUFFER +static mp_obj_t utils_from_int32_buffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return utils_from_intbuffer_helper(n_args, pos_args, kw_args, UTILS_INT32_BUFFER); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(utils_from_int32_buffer_obj, 1, utils_from_int32_buffer); +#endif + +#ifdef ULAB_UTILS_HAS_FROM_UINT32_BUFFER +static mp_obj_t utils_from_uint32_buffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { + return utils_from_intbuffer_helper(n_args, pos_args, kw_args, UTILS_UINT32_BUFFER); +} + +MP_DEFINE_CONST_FUN_OBJ_KW(utils_from_uint32_buffer_obj, 1, utils_from_uint32_buffer); +#endif + +#endif + +static const mp_rom_map_elem_t ulab_utils_globals_table[] = { + { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_utils) }, + #if ULAB_UTILS_HAS_FROM_INT16_BUFFER + { MP_OBJ_NEW_QSTR(MP_QSTR_from_int16_buffer), (mp_obj_t)&utils_from_int16_buffer_obj }, + #endif + #if ULAB_UTILS_HAS_FROM_UINT16_BUFFER + { MP_OBJ_NEW_QSTR(MP_QSTR_from_uint16_buffer), 
(mp_obj_t)&utils_from_uint16_buffer_obj }, + #endif + #if ULAB_UTILS_HAS_FROM_INT32_BUFFER + { MP_OBJ_NEW_QSTR(MP_QSTR_from_int32_buffer), (mp_obj_t)&utils_from_int32_buffer_obj }, + #endif + #if ULAB_UTILS_HAS_FROM_UINT32_BUFFER + { MP_OBJ_NEW_QSTR(MP_QSTR_from_uint32_buffer), (mp_obj_t)&utils_from_uint32_buffer_obj }, + #endif +}; + +static MP_DEFINE_CONST_DICT(mp_module_ulab_utils_globals, ulab_utils_globals_table); + +const mp_obj_module_t ulab_utils_module = { + .base = { &mp_type_module }, + .globals = (mp_obj_dict_t*)&mp_module_ulab_utils_globals, +}; +MP_REGISTER_MODULE(MP_QSTR_ulab_dot_utils, ulab_utils_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB); + +#endif diff --git a/circuitpython/extmod/ulab/code/utils/utils.h b/circuitpython/extmod/ulab/code/utils/utils.h new file mode 100644 index 0000000..b2155c3 --- /dev/null +++ b/circuitpython/extmod/ulab/code/utils/utils.h @@ -0,0 +1,19 @@ +/* + * This file is part of the micropython-ulab project, + * + * https://github.com/v923z/micropython-ulab + * + * The MIT License (MIT) + * + * Copyright (c) 2020-2021 Zoltán Vörös +*/ + +#ifndef _UTILS_ +#define _UTILS_ + +#include "../ulab.h" +#include "../ndarray.h" + +extern const mp_obj_module_t ulab_utils_module; + +#endif |